From 60f90804d53304d270237ff66f476cdec82c91da Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 1 May 2022 04:42:33 -0500 Subject: [PATCH 01/96] Time tracking in MatchBox --- amgprec/impl/aggregator/MatchBoxPC.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index c1ec0976..8eb4af08 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -64,6 +64,13 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif + +#ifdef #IE + + #ifdef TIME_TRACKER + double tmr = MPI_Wtime(); + #endif + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, verLocPtr, verLocInd, edgeLocWeight, verDistance, Mate, @@ -71,6 +78,12 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, msgIndSent, msgActualSent, msgPercent, ph0_time, ph1_time, ph2_time, ph1_card, ph2_card ); + + #ifdef TIME_TRACKER + tmr = MPI_Wtime() - tmr; + fprintf(stderr, "Elaboration time: %f\n", tmr); + #endif + #endif } From 1760afbe97acb1d8e18f56b74b3c7dbe83aca863 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 1 May 2022 04:47:03 -0500 Subject: [PATCH 02/96] Time tracking in algoDistEdge --- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 8be438b6..62e5112f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -92,6 +92,21 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanReal* msgPercent, MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { + + /* + * verDistance: it's a vector long as the number of processors. 
+ * verDistance[i] contains the first node index of the i-th processor + * verDistance[i + 1] contains the last node index of the i-th processor + * NLVer: number of elements in the LocPtr + * NLEdge: number of edges assigned to the current processor + * + * Contains the portion of matrix assigned to the processor in + * Yale notation + * verLocInd: contains the positions on row of the matrix + * verLocPtr: i-th value is the position of the first element on the i-th row and + * i+1-th value is the position of the first element on the i+1-th row + */ + #if !defined(SERIAL_MPI) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< EndIndex) ) } //End of for(ghost vertices) + +#ifdef TIME_TRACKER + Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; + fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); +#endif + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Sun, 1 May 2022 05:42:42 -0500 Subject: [PATCH 03/96] verGhostIndInitialization and Ghost2LocalInitialization cycles parallelization --- amgprec/impl/aggregator/Makefile | 2 +- amgprec/impl/aggregator/MatchBoxPC.cpp | 3 +- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 65 ++++++++++++++----- exec.sh | 7 ++ samples/advanced/pdegen/Makefile | 2 +- 5 files changed, 60 insertions(+), 19 deletions(-) create mode 100755 exec.sh diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index d857a3b0..0444e60d 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -4,7 +4,7 @@ INCDIR=../../../include MODDIR=../../../modules HERE=../.. -FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) +FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) -fopenmp CXXINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(INCDIR) $(FMFLAG)/. 
#CINCLUDES= -I${SUPERLU_INCDIR} -I${HSL_INCDIR} -I${SPRAL_INCDIR} -I/home/users/pasqua/Ambra/BootCMatch/include -lBCM -L/home/users/pasqua/Ambra/BootCMatch/lib -lm diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index 8eb4af08..fc30e8fd 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -60,13 +60,12 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { #if !defined(SERIAL_MPI) MPI_Comm C_comm=MPI_Comm_f2c(icomm); + #ifdef DEBUG fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif -#ifdef #IE - #ifdef TIME_TRACKER double tmr = MPI_Wtime(); #endif diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 62e5112f..da8b3896 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -1,4 +1,6 @@ #include "MatchBoxPC.h" +#include +#include // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -167,25 +169,40 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( if (myRank == 0) cout<<"\n("< EndIndex) ) { //Find a ghost - storedAlready = Ghost2LocalMap.find( insertMe ); - if ( storedAlready != Ghost2LocalMap.end() ) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - numGhostEdges++; - } else { //Insert an entry for the ghost: - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "< 0 ) cout< Date: Sun, 1 May 2022 15:26:47 -0500 Subject: [PATCH 05/96] Single parallel regions with multiple for cycles Added OMP for testing --- amgprec/impl/aggregator/Makefile | 2 +- amgprec/impl/aggregator/MatchBoxPC.cpp | 3 +- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 191 ++++++++++-------- exec.sh | 2 +- 4 files changed, 115 insertions(+), 83 deletions(-) diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 0444e60d..d857a3b0 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -4,7 +4,7 @@ INCDIR=../../../include MODDIR=../../../modules HERE=../.. -FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) -fopenmp +FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) CXXINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(INCDIR) $(FMFLAG)/. 
#CINCLUDES= -I${SUPERLU_INCDIR} -I${HSL_INCDIR} -I${SPRAL_INCDIR} -I/home/users/pasqua/Ambra/BootCMatch/include -lBCM -L/home/users/pasqua/Ambra/BootCMatch/lib -lm diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index fc30e8fd..270c6d04 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -66,6 +66,7 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif +#define TIME_TRACKER #ifdef TIME_TRACKER double tmr = MPI_Wtime(); #endif @@ -80,7 +81,7 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, #ifdef TIME_TRACKER tmr = MPI_Wtime() - tmr; - fprintf(stderr, "Elaboration time: %f\n", tmr); + fprintf(stderr, "Elaboration time: %f for $ld\n", tmr, NLEdge); #endif #endif diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 6f5dd9be..818c9f07 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -124,7 +124,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( //inputSubGraph.getStartEndIndices(StartIndex, EndIndex); MilanLongInt StartIndex = verDistance[myRank]; //The starting vertex owned by the current rank //MilanLongInt EndIndex = verDistance[myRank+1]; //The ending vertex owned by the current rank - MilanLongInt EndIndex = verDistance[myRank+1]-1; //The ending vertex owned by the current rank + MilanLongInt EndIndex = verDistance[myRank + 1] - 1; //The ending vertex owned by the current rank MPI_Status computeStatus; const int ComputeTag = 7; //Predefined tag @@ -135,8 +135,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( int message_length; //MilanLongInt NLVer=0, NLEdge=0, StartIndex=0, EndIndex=0; - MilanLongInt msgActual=0, msgInd=0; - MilanReal heaviestEdgeWt=0.0f; //Assumes positive weight + MilanLongInt msgActual = 0, msgInd = 0; + MilanReal heaviestEdgeWt = 0.0f; //Assumes positive weight MilanReal startTime, finishTime; //MilanReal Precision = MPI_Wtick(); //Get the precision of the MPI Timer startTime = MPI_Wtime(); @@ -150,18 +150,18 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex + map Ghost2LocalMap; //Map each ghost vertex to a local vertex // index that starts with zero to |Vg| - 1 map::iterator storedAlready; - vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe=0; //Number of Ghost vertices + vector Counter; //Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; + //Mate array for ghost vertices: + vector GMate; //Proportional to the number of ghost vertices + #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel for private(insertMe) firstprivate(StartIndex, EndIndex) default(shared) - for ( i=0; i EndIndex) ) { //Find a ghost +//#define OMP +#ifdef OMP +#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { +#endif + //printf("Id %d\n", omp_get_thread_num()); + +#ifdef 
OMP +#pragma omp for +#endif + for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice + insertMe = verLocInd[i]; + //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost +#ifdef OMP #pragma omp critical - { - numGhostEdges++; - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]<second<<" - "<first<<" : "<second]< verGhostPtr, verGhostInd, tempCounter; - //Mate array for ghost vertices: - vector GMate; //Proportional to the number of ghost vertices - try { - verGhostPtr.reserve(numGhostVertices+1); //Pointer Vector - tempCounter.reserve(numGhostVertices); //Pointer Vector - verGhostInd.reserve(numGhostEdges); //Index Vector - GMate.reserve(numGhostVertices); //Ghost Mate Vector - } catch ( length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; - exit(1); - } - //Initialize the Vectors: - verGhostPtr.resize(numGhostVertices+1, 0); //Pointer Vector - tempCounter.resize(numGhostVertices, 0); //Temporary Counter - verGhostInd.resize(numGhostEdges, -1); //Index Vector - GMate.resize(numGhostVertices, -1); //Temporary Counter - verGhostPtr[0] = 0; //The first value + + //Initialize adjacency Lists for Ghost Vertices: + try { + verGhostPtr.reserve(numGhostVertices + 1); //Pointer Vector + tempCounter.reserve(numGhostVertices); //Pointer Vector + verGhostInd.reserve(numGhostEdges); //Index Vector + GMate.reserve(numGhostVertices); //Ghost Mate Vector + } catch (length_error) { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + //Initialize the Vectors: + verGhostPtr.resize(numGhostVertices + 1, 0); //Pointer Vector + tempCounter.resize(numGhostVertices, 0); //Temporary Counter + verGhostInd.resize(numGhostEdges, -1); //Index Vector + GMate.resize(numGhostVertices, -1); //Temporary Counter + verGhostPtr[0] = 0; //The first value #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) cout< EndIndex) ) { //Find a ghost +#ifdef OMP #pragma omp critical { +#endif insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert verGhostInd[insertMe] = v + StartIndex; //Add the adjacency tempCounter[Ghost2LocalMap[w]]++; //Increment the counter +#ifdef OMP } +#endif } //End of if((w < StartIndex) || (w > EndIndex)) } //End of for(k) } //End of for (v) tempCounter.clear(); //Do not need this any more +#ifdef OMP + } //end of parallel region +#endif #ifdef TIME_TRACKER verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); diff --git a/exec.sh b/exec.sh index 02f4012a..d6e77a21 100755 --- a/exec.sh +++ b/exec.sh @@ -2,6 +2,6 @@ make all cd samples/advanced/pdegen make amg_d_pde3d cd runs -mpirun -np 8 amg_d_pde3d amg_pde3d.inp +mpirun -np 2 amg_d_pde3d amg_pde3d.inp From 76e04ee997e4d24e35414c32276b935902b08035 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 5 May 2022 15:57:58 -0500 Subject: [PATCH 06/96] The OMP and MPI version is now separated in two different files --- README.md | 3 +- 
amgprec/impl/aggregator/Makefile | 3 +- amgprec/impl/aggregator/MatchBoxPC.cpp | 17 +- amgprec/impl/aggregator/MatchBoxPC.h | 11 + ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 258 +-- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1467 +++++++++++++++++ exec.sh | 2 +- 7 files changed, 1565 insertions(+), 196 deletions(-) create mode 100644 amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp diff --git a/README.md b/README.md index fbea8c39..1d330385 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - - AMG4PSBLAS + AMG4PSBLAS Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.7) Salvatore Filippone (University of Rome Tor Vergata and IAC-CNR) diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index d857a3b0..1f6f52af 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -62,7 +62,8 @@ amg_s_parmatch_smth_bld.o \ amg_s_parmatch_spmm_bld_inner.o MPCOBJS=MatchBoxPC.o \ -algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o +algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ +algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o OBJS = $(FOBJS) $(MPCOBJS) diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index 270c6d04..90b448dc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -66,22 +66,35 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif + #define TIME_TRACKER #ifdef TIME_TRACKER double tmr = MPI_Wtime(); #endif - dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, +#define OMP +#ifdef OMP + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge, verLocPtr, verLocInd, edgeLocWeight, verDistance, Mate, myRank, numProcs, C_comm, msgIndSent, msgActualSent, msgPercent, ph0_time, ph1_time, ph2_time, ph1_card, ph2_card ); +#else + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, + verLocPtr, verLocInd, edgeLocWeight, + verDistance, Mate, + myRank, numProcs, C_comm, + msgIndSent, msgActualSent, msgPercent, + ph0_time, ph1_time, ph2_time, + ph1_card, ph2_card ); +#endif + #ifdef TIME_TRACKER tmr = MPI_Wtime() - tmr; - fprintf(stderr, "Elaboration time: %f for $ld\n", tmr, NLEdge); + fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer); #endif #endif diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 21d0a181..6c3f765f 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -152,6 +152,17 @@ extern "C" { inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs); +void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP + ( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, + MilanLongInt* verDistance, + MilanLongInt* Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, + MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, + MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp 
b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 818c9f07..8be438b6 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -1,6 +1,4 @@ #include "MatchBoxPC.h" -#include -#include // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -94,21 +92,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanReal* msgPercent, MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { - - /* - * verDistance: it's a vector long as the number of processors. - * verDistance[i] contains the first node index of the i-th processor - * verDistance[i + 1] contains the last node index of the i-th processor - * NLVer: number of elements in the LocPtr - * NLEdge: number of edges assigned to the current processor - * - * Contains the portion of matrix assigned to the processor in - * Yale notation - * verLocInd: contains the positions on row of the matrix - * verLocPtr: i-th value is the position of the first element on the i-th row and - * i+1-th value is the position of the first element on the i+1-th row - */ - #if !defined(SERIAL_MPI) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex + map Ghost2LocalMap; //Map each ghost vertex to a local vertex // index that starts with zero to |Vg| - 1 map::iterator storedAlready; - vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices + vector Counter; //Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe=0; //Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; - //Mate array for ghost vertices: - vector GMate; //Proportional to the number of ghost vertices - -#ifdef TIME_TRACKER - double Ghost2LocalInitialization = MPI_Wtime(); -#endif - -//#define OMP -#ifdef OMP -#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) - { -#endif - //printf("Id %d\n", omp_get_thread_num()); - -#ifdef OMP -#pragma omp for -#endif - for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice - insertMe = verLocInd[i]; - //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost -#ifdef OMP -#pragma omp critical - { -#endif - numGhostEdges++; - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "< EndIndex) ) { //Find a ghost + storedAlready = Ghost2LocalMap.find( insertMe ); + if ( storedAlready != Ghost2LocalMap.end() ) { //Has already been added + //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + numGhostEdges++; + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]<second<<" - "<first<<" : "<second]< verGhostPtr, verGhostInd, tempCounter; + //Mate array for ghost vertices: + vector GMate; //Proportional to the number of ghost vertices + try { + verGhostPtr.reserve(numGhostVertices+1); //Pointer Vector + tempCounter.reserve(numGhostVertices); //Pointer 
Vector + verGhostInd.reserve(numGhostEdges); //Index Vector + GMate.reserve(numGhostVertices); //Ghost Mate Vector + } catch ( length_error ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + //Initialize the Vectors: + verGhostPtr.resize(numGhostVertices+1, 0); //Pointer Vector + tempCounter.resize(numGhostVertices, 0); //Temporary Counter + verGhostInd.resize(numGhostEdges, -1); //Index Vector + GMate.resize(numGhostVertices, -1); //Temporary Counter + verGhostPtr[0] = 0; //The first value #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) cout< EndIndex) ) { //Find a ghost -#ifdef OMP -#pragma omp critical - { -#endif - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - tempCounter[Ghost2LocalMap[w]]++; //Increment the counter -#ifdef OMP - } -#endif + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v+StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter } //End of if((w < StartIndex) || (w > EndIndex)) } //End of for(k) } //End of for (v) tempCounter.clear(); //Do not need this any more - -#ifdef OMP - } //end of parallel region -#endif -#ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); -#endif - #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< +#include +// *********************************************************************** +// +// MatchboxP: A C++ library for approximate weighted matching +// Mahantesh Halappanavar (hala@pnnl.gov) +// Pacific Northwest National Laboratory +// +// *********************************************************************** +// +// Copyright (2021) Battelle Memorial Institute +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// ************************************************************************ +////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////// DOMINATING EDGES MODEL /////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate() + * + * Date : New update: Feb 17, 2019, Richland, Washington. + * Date : Original development: May 17, 2009, E&CS Bldg. + * + * Purpose : Compute Approximate Maximum Weight Matching in Linear Time + * + * Args : inputMatrix - instance of Compressed-Col format of Matrix + * Mate - The Mate array + * + * Returns : By Value: (void) + * By Reference: Mate + * + * Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge. + * Assumption: The Mate Array is empty. + */ + +/* + NLVer = #of vertices, NLEdge = #of edges + CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers) + verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process + Mate = A vector of size |V_p| (local subgraph) to store the output (matching) + MPI: myRank, numProcs, comm, + Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world + Statistics: ph0_time, ph1_time, ph2_time: Runtimes + Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) + */ + +#ifdef SERIAL_MPI +#else +//MPI type map +template MPI_Datatype TypeMap(); +template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template<> inline MPI_Datatype TypeMap() { return MPI_INT; } +template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } + +// DOUBLE PRECISION VERSION +//WARNING: The vertex block on a given rank is contiguous +void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt* verLocPtr, MilanLongInt* verLocInd, + MilanReal* edgeLocWeight, + MilanLongInt* verDistance, + MilanLongInt* Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, + MilanReal* msgPercent, + MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, + MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { + + /* + * verDistance: it's a vector long as the number of processors. 
+ * verDistance[i] contains the first node index of the i-th processor + * verDistance[i + 1] contains the last node index of the i-th processor + * NLVer: number of elements in the LocPtr + * NLEdge: number of edges assigned to the current processor + * + * Contains the portion of matrix assigned to the processor in + * Yale notation + * verLocInd: contains the positions on row of the matrix + * verLocPtr: i-th value is the position of the first element on the i-th row and + * i+1-th value is the position of the first element on the i+1-th row + */ + +#if !defined(SERIAL_MPI) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<::iterator verLocPtr = inputSubGraph.getVerPtr_b(); + //vector::iterator verLocInd = inputSubGraph.getVerInd_b(); + //vector::iterator edgeLocWeight = inputSubGraph.getEdgeWt_b(); + + //Data structures for sending and receiving messages: + vector Message; // [ u, v, message_type ] + Message.resize(3,-1); + const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; + const MilanLongInt SIZEINFO = 4; + MilanLongInt message_type = 0; + //Data structures for Message Bundling: + //Although up to two messages can be sent along any cross edge, + //only one message will be sent in the initialization phase - + //one of: REQUEST/FAILURE/SUCCESS + vector QLocalVtx, QGhostVtx, QMsgType; + vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! + vector PCounter; + MilanLongInt NumMessagesBundled; + MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! + vector candidateMate; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex + // index that starts with zero to |Vg| - 1 + map::iterator storedAlready; + vector Counter; //Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; + //Mate array for ghost vertices: + vector GMate; //Proportional to the number of ghost vertices + +#ifdef TIME_TRACKER + double Ghost2LocalInitialization = MPI_Wtime(); +#endif + + +#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { + +#pragma omp for + for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice + insertMe = verLocInd[i]; + //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost +#pragma omp critical + { + numGhostEdges++; + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) { //Has already been added + //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]< 0 ) + cout< EndIndex) ) { //Find a ghost +#pragma omp critical + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter + } + } //End of if((w < StartIndex) || (w > EndIndex)) + } //End of for(k) + } //End of for (v) + tempCounter.clear(); //Do not need this any more + +#pragma omp single + { + +#ifdef TIME_TRACKER + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<EndIndex) ) { 
//Is it a ghost vertex? + if(GMate[Ghost2LocalMap[verLocInd[k]]] >= 0 )// Already matched + continue; + } else { //A local vertex + if( Mate[verLocInd[k]-StartIndex] >= 0 ) // Already matched + continue; + } + + if( (edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt)&&(w < verLocInd[k])) ) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + candidateMate[v] = w; + + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost vertex + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex + if ( candidateMate[w-StartIndex] == (v+StartIndex) ) { + Mate[v] = w; //v is local + Mate[w-StartIndex] = v+StartIndex; //w is local + //Q.push_back(u); + U.push_back(v+StartIndex); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + //Get the Adjacency list for u + adj1 = verLocPtr[u-StartIndex]; //Pointer + adj2 = verLocPtr[u-StartIndex+1]; + for( k = adj1; k < adj2; k++ ) { + v = verLocInd[k]; + if ( (v >= StartIndex) && (v <= EndIndex) ) { //If Local Vertex: + if ( (vEndIndex) ) { //Is it a ghost vertex? + if(GMate[Ghost2LocalMap[v]] >= 0 )// Already matched + continue; + } else { //A local vertex + if( Mate[v-StartIndex] >= 0 ) // Already matched + continue; + } //End of else + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? 
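// Descriptive note on the checks below: the Mate array only covers the local
// vertex block [StartIndex, EndIndex], so a ghost endpoint's matching status is
// looked up in GMate through the Ghost2LocalMap index translation, while a local
// endpoint is checked directly in Mate (offset by StartIndex). Already-matched
// endpoints are skipped before the heaviest remaining edge is selected below.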
+ if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + continue; + } else { //A local vertex + if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + continue; + } + if( (edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + heaviestEdgeWt = edgeLocWeight[k1]; + w = verLocInd[k1]; + } + } //End of for loop + candidateMate[v-StartIndex] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if ( candidateMate[w-StartIndex] == v ) { + Mate[v-StartIndex] = w; //v is a local vertex + Mate[w-StartIndex] = v; //w is a local vertex + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v-StartIndex]; + adj12 = verLocPtr[v-StartIndex+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor is a ghost vertex + if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) + candidateMate[NLVer+Ghost2LocalMap[v]] = -1; + if ( v != Mate[u-StartIndex] ) { //u is local + //Build the Message Packet: + //Message[0] = u; //LOCAL + //Message[1] = v; //GHOST + //Message[2] = SUCCESS; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of while ( /*!Q.empty()*/ !U.empty() ) + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// +#ifdef DEBUG_HANG_ + if (myRank == 0) cout<<"\n("< PCumulative, PMessageBundle, PSizeInfoMessages; + MilanLongInt myIndex=0; + try { + PMessageBundle.reserve(NumMessagesBundled*3); //Three integers per message + PCumulative.reserve(numProcs+1); //Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs*3); //Buffer to hold the Size info message packets + } catch ( length_error ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + PMessageBundle.resize(NumMessagesBundled*3, -1);//Initialize + PCumulative.resize(numProcs+1, 0); //Only initialize the counter variable + PSizeInfoMessages.resize(numProcs*3, 0); + + for (MilanInt i=0; i SRequest; //Requests that are used for each send message + vector SStatus; //Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex=0; //Pointer for current message + try { + SRequest.reserve(numProcs*2); //At most two messages per processor + SStatus.reserve(numProcs*2);//At most two messages per processor + } catch ( length_error ) { + cout<<"Error in function 
algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + MPI_Request myReq; //A sample request + SRequest.resize(numProcs*2,myReq); + MPI_Status myStat; //A sample status + SStatus.resize(numProcs*2,myStat); + //Send the Messages + for (MilanInt i=0; i 0 ) { //Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i*3+0], 3, TypeMap(), i, ComputeTag, comm, &SRequest[MessageIndex]); + msgActual++; + MessageIndex++; + //Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), i, BundleTag, comm, &SRequest[MessageIndex]); + MessageIndex++; + } //End of if size > 0 + } + //Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); + PCounter.clear(); +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), comm, &OneMessageSize); //Size of one message packet + //How many messages to send? + //Potentially three kinds of messages will be sent/received: + //Request, Success, Failure. + //But only two will be sent from a given processor. + //Substract the number of messages that have already been sent as bundled messages: + MilanLongInt numMessagesToSend = numGhostEdges*2 - NumMessagesBundled; + MilanInt BufferSize = (OneMessageSize+MPI_BSEND_OVERHEAD)*numMessagesToSend; + + MilanLongInt *Buffer=0; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory + if ( Buffer == 0 ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout<<"Not enough memory to allocate for send buffer on process "< ReceiveBuffer; + MilanLongInt bundleSize=0, bundleCounter=0; + try { + ReceiveBuffer.reserve(numGhostEdges*2*3); //Three integers per cross edge + } catch ( length_error ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + while ( true ) { +#ifdef DEBUG_HANG_ + if (myRank == 0) cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only If a Local Vertex + //Get the Adjacency list for u + adj1 = verLocPtr[u-StartIndex]; //Pointer + adj2 = verLocPtr[u-StartIndex+1]; + for( k = adj1; k < adj2; k++ ) { + v = verLocInd[k]; + if ( (v >= StartIndex) && (v <= EndIndex) ) { //v is a Local Vertex: + if ( Mate[v-StartIndex] >= 0 ) // v is already matched + continue; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? 
+ if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + continue; + } + else { //A local vertex + if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + continue; + } + + if( (edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + heaviestEdgeWt = edgeLocWeight[k1]; + w = verLocInd[k1]; + } + } //End of for loop + candidateMate[v-StartIndex] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { + Mate[v-StartIndex] = w; //v is local + GMate[Ghost2LocalMap[w]] = v; //w is ghost + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if ( candidateMate[w-StartIndex] == v ) { + Mate[v-StartIndex] = w; //v is local + Mate[w-StartIndex] = v; //w is local + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { //no dominating edge found: w == -1 + adj11 = verLocPtr[v-StartIndex]; + adj12 = verLocPtr[v-StartIndex+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + } //End of if(GHOST) + } //End of for loop + } // End of Else: w == -1 + //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } //End of If (candidateMate[v-StartIndex] == u) + } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor v is a ghost vertex + if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) + candidateMate[NLVer+Ghost2LocalMap[v]] = -1; + if ( v != Mate[u-StartIndex] ) { //u is a local vertex + //Build the Message Packet: + Message[0] = u; //LOCAL + Message[1] = v; //GHOST + Message[2] = SUCCESS; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; +#ifdef DEBUG_GHOST_ + if ((uEndIndex)) { + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of while ( /*!Q.empty()*/ !U.empty() ) + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// + + //// BREAK IF NO MESSAGES EXPECTED ///////// +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS ) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout<<"\n*Error in call to MPI_Receive on Slave: "<(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS ) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout<<"\n*Error in call to 
MPI_Receive on processor "<NLVer)) { + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[u]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } //End of if ( candidateMate[v-StartIndex] == u )e + } //End of if ( Mate[v] == -1 ) + } //End of REQUEST + else { //CASE II: SUCCESS + if ( message_type == SUCCESS ) { +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[u]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) +#ifdef DEBUG_GHOST_ + if ((v<0)||(vNLVer)) { + cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? + if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + continue; + } + else { //A local vertex + if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + continue; + } + + if( (edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + heaviestEdgeWt = edgeLocWeight[k1]; + w = verLocInd[k1]; + } + } //End of for loop + candidateMate[v-StartIndex] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { + Mate[v-StartIndex] = w; //v is local + GMate[Ghost2LocalMap[w]] = v; //w is ghost + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if ( candidateMate[w-StartIndex] == v ) { + Mate[v-StartIndex] = w; //v is local + Mate[w-StartIndex] = v; //w is local + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { //No dominant edge found + adj11 = verLocPtr[v-StartIndex]; + adj12 = verLocPtr[v-StartIndex+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + } //End of if(GHOST) + } //End of for loop + } // End of Else: w == -1 + //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } //End of if ( candidateMate[v-StartIndex] == u ) + } //End of if ( Mate[v] == -1 ) + } //End of if ( message_type == SUCCESS ) + else { //CASE III: FAILURE +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[u]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } //End of else: CASE III + } //End of else: CASE I + } //End of 
if (!MsgQ.empty()) + ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer + free(Buffer); //Free the memory that was allocated + } + finishTime = MPI_Wtime(); + *ph2_time = finishTime-startTime; //Time taken for Phase-2 + *ph2_card = myCard ; //Cardinality at the end of Phase-2 + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0) { + *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; + } else { + *msgPercent = 0; + } + + } //end single + + } //end of parallel region + +#ifdef DEBUG_HANG_ + if (myRank == 0) cout<<"\n("< vtxIndex ) + End = Current - 1; + else //CASE 3: + Start = Current + 1; + } + } //End of While() + if ( Current == 0 ) + return (Current); + else { + if ( mVerDistance[Current] > vtxIndex ) + return (Current-1); + else + return (Current); + } //End of else + return (-1); //It should not reach here! +} //End of findOwnerOfGhost() +#endif + +#endif \ No newline at end of file diff --git a/exec.sh b/exec.sh index d6e77a21..3174e0a5 100755 --- a/exec.sh +++ b/exec.sh @@ -2,6 +2,6 @@ make all cd samples/advanced/pdegen make amg_d_pde3d cd runs -mpirun -np 2 amg_d_pde3d amg_pde3d.inp +mpirun -np 4 amg_d_pde3d amg_pde3d.inp From a20f0d47e7265b29a97a16e67c01e0ac22183681 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 8 May 2022 12:15:46 -0500 Subject: [PATCH 07/96] Solved the static queue out of scope problem --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 407 ++++++++++-------- exec.sh | 1 + 2 files changed, 223 insertions(+), 185 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index f232cfa2..27014cca 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -204,23 +204,25 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector verGhostPtr, verGhostInd, tempCounter; //Mate array for ghost vertices: vector GMate; //Proportional to the number of ghost vertices - + MilanLongInt S; + staticQueue U; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif - -#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, adj1, adj2, heaviestEdgeWt) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { -#pragma omp for + // TODO comments about the fking reduction + +#pragma omp for reduction(+ : numGhostEdges) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost + numGhostEdges++; #pragma omp critical { - numGhostEdges++; storedAlready = Ghost2LocalMap.find(insertMe); if (storedAlready != Ghost2LocalMap.end()) { //Has already been added //cout<<"Process "<first<<" - "<second< 0 ) - cout< 0 ) + cout< EndIndex) ) { //Find a ghost + + //TODO why the nowait here fails? 
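// A plausible answer to the TODO above (an inference, not stated in the patch):
// "nowait" removes the implicit barrier at the end of the worksharing loop that
// follows, so the first thread to finish can enter the subsequent
// "#pragma omp single" block while other threads are still filling verGhostInd
// and tempCounter, and the single block then reads those arrays mid-update.
// Keeping the implicit barrier, or issuing an explicit "#pragma omp barrier"
// before the single region, avoids that race.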
+ +#pragma omp for nowait + for (v = 0; v < NLVer; v++) { + adj1 = verLocPtr[v]; //Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) { + w = verLocInd[k]; //Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) { //Find a ghost #pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - tempCounter[Ghost2LocalMap[w]]++; //Increment the counter - } - } //End of if((w < StartIndex) || (w > EndIndex)) - } //End of for(k) - } //End of for (v) - tempCounter.clear(); //Do not need this any more + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter + } + } //End of if((w < StartIndex) || (w > EndIndex)) + } //End of for(k) + } //End of for (v) #pragma omp single { #ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[verLocInd[k]]] >= 0 )// Already matched + for (k = adj1; k < adj2; k++) { + if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched continue; } else { //A local vertex - if( Mate[verLocInd[k]-StartIndex] >= 0 ) // Already matched + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched continue; } - if( (edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt)&&(w < verLocInd[k])) ) { + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; } } //End of for loop candidateMate[v] = w; + // } - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { - S--; //Decrement S + /* + for ( v=0; v < NLVer; v++ ) { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { // w is a local vertex - if ( candidateMate[w-StartIndex] == (v+StartIndex) ) { - Mate[v] = w; //v is local - Mate[w-StartIndex] = v+StartIndex; //w is local - //Q.push_back(u); - U.push_back(v+StartIndex); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { - w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + adj1 = verLocPtr[v]; + adj2 = verLocPtr[v + 1]; + w = candidateMate[v]; +*/ +//#pragma omp critical + // { + //End: 
PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0) { + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex //Build the Message Packet: //Message[0] = v+StartIndex; //LOCAL //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE + //Message[2] = REQUEST; //TYPE //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Date: Sun, 8 May 2022 15:11:56 -0500 Subject: [PATCH 08/96] PARALLEL_COMPUTE_CANDIDATE_MATE_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 60 ++++++++++--------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 27014cca..63e76d6d 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -162,7 +162,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector PCounter; MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - vector candidateMate; + //vector candidateMate; + MilanLongInt* candidateMate = new MilanLongInt[1]; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Mon, 9 May 2022 16:52:03 -0500 Subject: [PATCH 09/96] Instable PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 141 +++++++++++------- 1 file changed, 85 insertions(+), 56 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 63e76d6d..77494032 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -159,7 +159,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //one of: REQUEST/FAILURE/SUCCESS vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - vector PCounter; + + MilanLongInt* PCounter = new MilanLongInt [numProcs]; + for (int i = 0; i < numProcs; i++) + PCounter[i] = 0; + MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! 
//vector candidateMate; @@ -211,7 +215,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, adj1, adj2, heaviestEdgeWt, w) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -370,13 +374,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.reserve(numGhostEdges); //Ghost Vertex QMsgType.reserve(numGhostEdges); //Message Type (Request/Failure) QOwner.reserve(numGhostEdges); //Owner of the ghost: COmpute once and use later - PCounter.reserve(numProcs); //Store How many messages will be sent to each processor } catch (length_error) { cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; cout << "Not enough memory to allocate the internal variables \n"; exit(1); } - PCounter.resize(numProcs, 0); //Only initialize the counter variable #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { + myCard++; if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex //Build the Message Packet: //Message[0] = v+StartIndex; //LOCAL @@ -495,56 +511,65 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), ComputeTag, comm);*/ - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; msgInd++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { - Mate[v] = w; - GMate[Ghost2LocalMap[w]] = v + StartIndex; //w is a Ghost - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - myCard++; + NumMessagesBundled++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + PCounter[ghostOwner]++; //TODO maybe reduction? 
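// One way to resolve the TODO above (a suggestion, not part of the original
// change): since OpenMP 4.5 a pointer-based array section may appear in a
// reduction clause, so the per-owner counts in PCounter can be accumulated
// race-free with something like
//   #pragma omp for reduction(+ : PCounter[:numProcs])
// on the enclosing worksharing loop; the same clause style appears elsewhere
// in this patch series.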
+#pragma omp critical + { + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); + //ghostOwner = inputSubGraph.findOwner(w); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { + + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; //w is a Ghost + U.push_back(v + StartIndex); + U.push_back(w); + #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) + if (Counter[Ghost2LocalMap[w]] > 0) { + + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v + } + } //End of if Counter[w] > 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } // end of critical region } //End of if a Ghost Vertex else { // w is a local vertex - if (candidateMate[w - StartIndex] == (v + StartIndex)) { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - myCard++; + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { +#pragma omp critical + { + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); + #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) + } + } //End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } //End of Else + } //End of if(w >=0) else { adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; @@ -563,23 +588,28 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), ComputeTag, comm); */ - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + PCounter[ghostOwner]++; +#pragma omp critical + { + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + //ghostOwner = inputSubGraph.findOwner(w); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + } + } //End of if(GHOST) } //End of for loop } // End of Else: w == -1 //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) //} // end of critical } //End of for ( v=0; v < NLVer; v++ ) + } // end of parallel region tempCounter.clear(); //Do not need this any more //} // end of parallel region @@ -855,7 +885,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.clear(); QMsgType.clear(); QOwner.clear(); - PCounter.clear(); #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< EndIndex)) { //Is it a ghost vertex? 
- if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - continue; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - continue; - } - - if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - - } - } //End of for loop - candidateMate[v] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B + * It is actually not possible to parallelize this cycle + * as it is. + * + * TODO think how it could be parallelizable + */ - } + for ( v=0; v < NLVer; v++ ) { +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched + continue; + } else { //A local vertex + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched + continue; + } - /* - TODO this cycle has a lot of margin of improvement!!!! - This current version introduce some errors. - 1 - ollback to the previous verison and check if it is - 100% stable - 2 - if the previous verison was stable all right, if not - that's a big deal - 3 - reimplement step by step to check from where the instability - comes from - */ + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; -#pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) - for ( v=0; v < NLVer; v++ ) { + } + } //End of for loop + candidateMate[v] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) + //If found a dominating edge: + if (w >= 0) { + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } // end of critical region - } //End of if a Ghost Vertex - else { // w is a local vertex + } + } //End of if Counter[w] > 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical - { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); + + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < 
adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) + else { + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } @@ -962,7 +934,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } while ( true ) { #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } @@ -1112,7 +1084,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); msgInd++; msgActual++; #ifdef DEBUG_GHOST_ - if ((uEndIndex)) { + if ((uEndIndex)) { cout<<"\n("<EndIndex)) { + if ((vEndIndex)) { cout<<"\n("<NLVer)) { + if ((v<0)||(vNLVer)) { cout<<"\n("< 0 //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) #ifdef DEBUG_GHOST_ - if ((v<0)||(vNLVer)) { + if ((v<0)||(vNLVer)) { cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } @@ -1451,8 +1423,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); //MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer if ( BufferSize > 0 ) { - MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer - free(Buffer); //Free the memory that was allocated + MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer + free(Buffer); //Free the memory that was allocated } finishTime = MPI_Wtime(); *ph2_time = finishTime-startTime; //Time taken for Phase-2 @@ -1478,9 +1450,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( *msgActualSent = msgActual; *msgIndSent = msgInd; if (msgInd > 0) { - *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; + *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; } else { - *msgPercent = 0; + *msgPercent = 0; } #ifdef DEBUG_HANG_ From 1140669ea7b07fd93b0d528ba12d9bb49742f920 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 07:01:42 -0500 Subject: [PATCH 11/96] firstComputeCandidateMate --- amgprec/impl/aggregator/MatchBoxPC.h | 5 ++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 59 ++++++++++++++++++- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 6c3f765f..94ea7ea8 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -152,6 +152,11 @@ extern "C" { inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt 
numProcs); +inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt* verLocInd, + MilanReal* edgeLocWeight); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 4814d32e..84d05e08 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -455,9 +455,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; - } } //End of for loop + //printf("Compare %ld, %ld\n", w, firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight)); candidateMate[v] = w; //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) @@ -594,8 +594,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - while ( /*!Q.empty()*/ !U.empty() ) { - //Q.pop_front(); + while ( !U.empty() ) { u = U.pop_front(); //Get an element from the queue #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + return w; +} + +/* +inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt* verLocInd, + MilanReal* edgeLocWeight) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) { + if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched + continue; + } else { //A local vertex + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched + continue; + } + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + return w; +} + */ #endif #endif \ No newline at end of file diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..bdacc992 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +0123 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From b4bfdd83e5fdf310e54af8e8127dc30ace50a804 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 10:22:58 -0500 Subject: [PATCH 12/96] computeCandidateMate and isAlreadyMatched --- amgprec/impl/aggregator/MatchBoxPC.h | 19 +++++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 73 ++++++++++++++----- 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 94ea7ea8..73908b9b 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -157,6 +157,25 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt* verLocInd, MilanReal* edgeLocWeight); +inline bool isAlreadyMatched(MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap); + +inline MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 84d05e08..e73c7939 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -443,13 +443,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( w = -1; heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN for (k = adj1; k < adj2; k++) { - if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - continue; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - continue; - } + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { @@ -1533,22 +1527,66 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, return w; } -/* -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, +/** + * //TODO documentation + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline bool isAlreadyMatched(MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap + ) { + + if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? 
+ if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched + return true; + } else { //A local vertex + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched + return true; + } + + return false; +} + +/** + * //TODO documentation + * @param adj1 + * @param adj2 + * @param edgeLocWeight + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, MilanLongInt* verLocInd, - MilanReal* edgeLocWeight) + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN for (k = adj1; k < adj2; k++) { - if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - continue; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - continue; - } + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { @@ -1558,7 +1596,6 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, } //End of for loop return w; } - */ #endif #endif \ No newline at end of file From 6180f29f39380436c2a03d5e07c9fd76c0039d4d Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 11:23:39 -0500 Subject: [PATCH 13/96] PARALLEL_COMPUTE_CANDIDATE_MATE_B is now paralle and correct --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 63 +++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e73c7939..1b7014f5 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -424,38 +424,35 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Compute the Initial Matching Set: S = numGhostVertices; //Initialize S with number of Ghost Vertices + } // end of single region - /* - * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B - * It is actually not possible to parallelize this cycle - * as it is. - * - * TODO think how it could be parallelizable - */ + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
+ */ +#pragma omp for for ( v=0; v < NLVer; v++ ) { #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - } - } //End of for loop - //printf("Compare %ld, %ld\n", w, firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight)); - candidateMate[v] = w; + candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + } +#pragma omp single + { + + + for ( v=0; v < NLVer; v++ ) + { //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { myCard++; @@ -1516,15 +1527,17 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + int finalK; for (int k = adj1; k < adj2; k++) { if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; + finalK = k; } } //End of for loop - return w; + return finalK; } /** @@ -1579,9 +1592,9 @@ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt* verLocInd, MilanLongInt StartIndex, MilanLongInt EndIndex, - vector &GMate, + vector & GMate, MilanLongInt* Mate, - map &Ghost2LocalMap) + map & Ghost2LocalMap) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN From 2cac21b345bfdc2e0eb1e2116bc7e8212f602ba7 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 11:46:40 -0500 Subject: [PATCH 14/96] fix and reformatting --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 208 +++++++++--------- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 109 insertions(+), 101 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 1b7014f5..9349e9a2 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -446,7 +446,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #pragma omp single { - for ( v=0; v < NLVer; v++ ) { //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) @@ -461,131 +460,140 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) + + //This piece of code is actually executed under 0.01% of the times + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) { + + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + 
//Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) + if (Counter[Ghost2LocalMap[w]] > 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { // w is a local vertex + } + } //End of if Counter[w] > 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex - if (candidateMate[w - StartIndex] == (v + StartIndex)) { + if (candidateMate[w - StartIndex] == (v + StartIndex)) { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) + + //if (w < 0) { -- if it arrives here this one if is useless, it is certainly -1 + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Date: Sun, 22 May 2022 16:50:07 -0500 Subject: [PATCH 15/96] PARALLEL_PROCESS_EXPOSED_VERTEX_B named critical sections --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 9349e9a2..e83c37b8 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -443,9 +443,16 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) } -#pragma omp single - { + /* + * PARALLEL_PROCESS_EXPOSED_VERTEX_B + * The sequential version could be a bit more + * efficient. + * + * TODO: Test when it's more efficient to execute this code + * in parallel. 
+ */ +#pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) for ( v=0; v < NLVer; v++ ) { //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) @@ -461,23 +468,24 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout<<"\n("<= 0) { //This piece of code is actually executed under 0.01% of the times - if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; +#pragma omp critical + { + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } } if (w >= 0) { @@ -500,7 +508,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); PCounter[ghostOwner]++; - +#pragma omp critical (QLocalPush) + { QLocalVtx.push_back(v + StartIndex); QGhostVtx.push_back(w); QMsgType.push_back(REQUEST); @@ -534,11 +543,13 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if Counter[w] > 0 //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) } //End of if CandidateMate[w] = v + } // end of critical region } //End of if a Ghost Vertex else { // w is a local vertex if (candidateMate[w - StartIndex] == (v + StartIndex)) { - +#pragma omp critical (UPush) + { Mate[v] = w; //v is local Mate[w - StartIndex] = v + StartIndex; //w is local //Q.push_back(u); @@ -548,6 +559,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Sun, 22 May 2022 17:35:08 -0500 Subject: [PATCH 16/96] False sharing fix --- ...DomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e83c37b8..e598a21f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -293,8 +293,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /* * OMP verGhostPtrInitialization * + * schedule(static) assign to each thread an huge chunk + * it is used in this case to reduce the overhead of chunk assignment + * and to reduce false sharing */ -#pragma omp for nowait +#pragma omp for nowait schedule(static) for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ @@ -332,7 +335,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double verGhostIndInitialization = MPI_Wtime(); #endif -#pragma omp for nowait +#pragma omp for nowait schedule(static) for (v = 0; v < NLVer; v++) { adj1 = verLocPtr[v]; //Vertex Pointer adj2 = verLocPtr[v + 1]; @@ -433,7 +436,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
*/ -#pragma omp for +#pragma omp for schedule(static) for ( v=0; v < NLVer; v++ ) { #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Sun, 29 May 2022 12:01:24 -0500 Subject: [PATCH 17/96] Refactoring + critical(Mate) --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 79 +++++++++---------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e598a21f..a08c5485 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -215,7 +215,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner, u) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -475,9 +475,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( if (w >= 0) { //This piece of code is actually executed under 0.01% of the times -#pragma omp critical - { - if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { w = computeCandidateMate(verLocPtr[v], verLocPtr[v + 1], edgeLocWeight, 0, @@ -489,7 +488,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( Ghost2LocalMap); candidateMate[v] = w; } - } if (w >= 0) { @@ -511,7 +509,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); PCounter[ghostOwner]++; -#pragma omp critical +#pragma omp critical(Mate) { QLocalVtx.push_back(v + StartIndex); QGhostVtx.push_back(w); @@ -551,7 +549,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( else { // w is a local vertex if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical +#pragma omp critical(Mate) { Mate[v] = w; //v is local Mate[w - StartIndex] = v + StartIndex; //w is local @@ -632,41 +630,32 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( adj2 = verLocPtr[u-StartIndex+1]; for( k = adj1; k < adj2; k++ ) { v = verLocInd[k]; + if ( (v >= StartIndex) && (v <= EndIndex) ) { //If Local Vertex: - if ( (vEndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[v]] >= 0 )// Already matched - continue; - } else { //A local vertex - if( Mate[v-StartIndex] >= 0 ) // Already matched - continue; - } //End of else + + if (isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? 
- if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched - continue; - } else { //A local vertex - if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched - continue; - } - if( (edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { - heaviestEdgeWt = edgeLocWeight[k1]; - w = verLocInd[k1]; - } - } //End of for loop - candidateMate[v-StartIndex] = w; + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); +#pragma omp critical + { + candidateMate[v - StartIndex] = w; + } //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< &GMate, @@ -1582,15 +1570,20 @@ inline bool isAlreadyMatched(MilanLongInt k, map &Ghost2LocalMap ) { - if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - return true; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - return true; + bool result = false; +#pragma omp critical(Mate) + { + if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched + result = true; + } else { //A local vertex + if (Mate[node - StartIndex] >= 0) // Already matched + result = true; + } + } - return false; + return result; } /** @@ -1621,7 +1614,7 @@ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN for (k = adj1; k < adj2; k++) { - if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { From f3d7b3ab5e508857dddb8eb5b4aac5fa43c92c57 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 29 May 2022 12:01:28 -0500 Subject: [PATCH 18/96] False sharing fix --- amgprec/impl/aggregator/MatchBoxPC.h | 3 +-- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 73908b9b..ba7cb5c8 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -157,8 +157,7 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt* verLocInd, MilanReal* edgeLocWeight); -inline bool isAlreadyMatched(MilanLongInt k, - MilanLongInt* verLocInd, +inline bool isAlreadyMatched(MilanLongInt node, MilanLongInt StartIndex, MilanLongInt EndIndex, vector &GMate, diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..bdacc992 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +0123 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From 2c8dc2ffddba0669e93c837e0bc440e674644eda Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 30 May 2022 13:49:34 -0500 Subject: [PATCH 19/96] PROCESS MATCHED VERTICES parallelization draft --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 126 ++++++++++++------ samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 87 insertions(+), 41 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index a08c5485..0140c0c6 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -210,12 +210,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Mate array for ghost vertices: vector GMate; //Proportional to the number of ghost vertices MilanLongInt S; - staticQueue U; + MilanLongInt privateMyCard = 0; + staticQueue U, privateU; + bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner, u) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner, u, privateU, privateMyCard, isEmpty) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -451,6 +453,10 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * The sequential version could be a bit more * efficient. * + * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner + * first in a local variable and then, only at the end, append them to the real data structure + * to remove the critical sections. + * * TODO: Test when it's more efficient to execute this code * in parallel. 
*/ @@ -508,6 +514,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( msgInd++; NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); PCounter[ghostOwner]++; #pragma omp critical(Mate) { @@ -590,15 +598,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; msgInd++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); PCounter[ghostOwner]++; #pragma omp critical { QLocalVtx.push_back(v + StartIndex); QGhostVtx.push_back(w); QMsgType.push_back(FAILURE); - //ghostOwner = inputSubGraph.findOwner(w); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); } @@ -607,7 +614,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //} // End of Else: w == -1 //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) - } // end of parallel region tempCounter.clear(); //Do not need this any more @@ -619,19 +625,37 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// + privateU.~staticQueue(); + new(&privateU) staticQueue(1000); //TODO how can I put a meaningfull size? + /* while ( !U.empty() ) { u = U.pop_front(); //Get an element from the queue + */ + isEmpty = false; + while( true ) + { + +#pragma omp critical(U) + { + if (U.empty()) isEmpty = true; + else u = U.pop_front(); + } // End of critical U + if (isEmpty) break; + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + +#pragma omp critical + { //Get the Adjacency list for u - adj1 = verLocPtr[u-StartIndex]; //Pointer - adj2 = verLocPtr[u-StartIndex+1]; - for( k = adj1; k < adj2; k++ ) { + adj1 = verLocPtr[u - StartIndex]; //Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { v = verLocInd[k]; - if ( (v >= StartIndex) && (v <= EndIndex) ) { //If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: if (isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; @@ -640,7 +664,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - if ( candidateMate[v-StartIndex] == u ) { //Only if pointing to the matched vertex + if (candidateMate[v - StartIndex] == u) { //Only if pointing to the matched vertex //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], @@ -652,17 +676,16 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( GMate, Mate, Ghost2LocalMap); -#pragma omp critical - { + candidateMate[v - StartIndex] = w; - } + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { //A ghost //Build the Message Packet: //Message[0] = v; //LOCAL //Message[1] = w; //GHOST @@ -678,26 +701,28 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.push_back(w); QMsgType.push_back(REQUEST); //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = 
findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; - if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { - Mate[v-StartIndex] = w; //v is a local vertex + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; //v is a local vertex GMate[Ghost2LocalMap[w]] = v; //w is a ghost vertex //Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< 0 ) { + if (Counter[Ghost2LocalMap[w]] > 0) { Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { + if (Counter[Ghost2LocalMap[w]] == 0) { S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<=0) else { - adj11 = verLocPtr[v-StartIndex]; - adj12 = verLocPtr[v-StartIndex+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) { //A ghost //Build the Message Packet: //Message[0] = v; //LOCAL //Message[1] = w; //GHOST @@ -744,7 +769,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.push_back(w); QMsgType.push_back(FAILURE); //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); PCounter[ghostOwner]++; NumMessagesBundled++; @@ -756,9 +783,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of If (candidateMate[v-StartIndex] == u) } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { //Neighbor is a ghost vertex - if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) - candidateMate[NLVer+Ghost2LocalMap[v]] = -1; - if ( v != Mate[u-StartIndex] ) { //u is local + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) { //u is local //Build the Message Packet: //Message[0] = u; //LOCAL //Message[1] = v; //GHOST @@ -774,7 +801,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.push_back(v); QMsgType.push_back(SUCCESS); //ghostOwner = inputSubGraph.findOwner(v); - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); PCounter[ghostOwner]++; NumMessagesBundled++; @@ -782,8 +811,24 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of If( v != Mate[u] ) } //End of Else //A Ghost Vertex } //End of For Loop adj(u) + + } + } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + +#pragma omp critical(U) + { + while(!privateU.empty()) { + U.push_back(privateU.pop_front()); + } + + myCard += privateMyCard; + } //End of critical U + } //End of while ( 
/*!Q.empty()*/ !U.empty() ) + + } // end of parallel region + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ if (myRank == 0) cout<<"\n("<(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; + msgInd++; + msgActual++; #ifdef DEBUG_GHOST_ if ((uEndIndex)) { cout<<"\n("<= 0) { - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - //Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; + //If found a dominating edge: + if (w >= 0) { if ((w < StartIndex) || (w > EndIndex)) { //A ghost //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + Mate[w - StartIndex] = v; //w is a local vertex + //Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { //Neighbor is a ghost vertex - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex #pragma omp critical(U) @@ -827,8 +834,15 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of while ( /*!Q.empty()*/ !U.empty() ) +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + 
omp_get_thread_num(), + myRank); +#endif } // end of parallel region + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ if (myRank == 0) cout<<"\n("< Date: Tue, 31 May 2022 16:04:56 -0500 Subject: [PATCH 21/96] Removed one critical region from PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 86 ++++++++++++------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 43584b77..783ed17e 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -211,13 +211,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector GMate; //Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; - staticQueue U, privateU; + staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; + + /* + staticQueue privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner; + */ bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner /*, privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner*/) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -291,21 +295,18 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double verGhostPtrInitialization = MPI_Wtime(); #endif - } + /* - * OMP verGhostPtrInitialization - * - * schedule(static) assign to each thread an huge chunk - * it is used in this case to reduce the overhead of chunk assignment - * and to reduce false sharing + * Not parallelizable */ -#pragma omp for nowait schedule(static) + for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ cout< EndIndex)) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Thu, 2 Jun 2022 07:29:21 -0500 Subject: [PATCH 22/96] Further optimizations PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 284 +++++++++--------- 1 file changed, 143 insertions(+), 141 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 783ed17e..9f3cbb97 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -164,6 +164,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( for (int i = 0; i < 
numProcs; i++) PCounter[i] = 0; + MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! //vector candidateMate; @@ -213,15 +214,12 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - /* - staticQueue privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner; - */ bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner /*, privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner*/) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -402,7 +400,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * Create the Queue Data Structure for the Dominating Set * * I had to declare the staticuQueue U before the parallel region - * to have it in the correct scope. Since we can't chane the dimension + * to have it in the correct scope. Since we can't change the dimension * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ @@ -462,102 +460,103 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * in parallel. */ - MilanLongInt size = numGhostEdges; //TODO how can I decide a meaningfull size? + MilanLongInt size = numGhostVertices; //TODO how can I decide a more meaningfull size? //Fail messages privateQLocalVtx.~staticQueue(); privateQGhostVtx.~staticQueue(); privateQMsgType.~staticQueue(); privateQOwner.~staticQueue(); - //Request messages - /* - privateReqQLocalVtx.~staticQueue(); - privateReqQGhostVtx.~staticQueue(); - privateReqQMsgType.~staticQueue(); - privateReqQOwner.~staticQueue(); - */ + privateU.~staticQueue(); + + new(&privateU) staticQueue(NLVer + numGhostVertices); //TODO how can I put a meaningfull size? 
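The explicit destructor call followed by placement new, here and for the private queues just below, is the idiom these patches use to give a staticQueue declared before the parallel region its real capacity once the ghost counts are known. A reduced sketch of the idiom on a stand-in type (FixedQueue is illustrative, not the real staticQueue):

#include <new> // placement new

// FixedQueue: capacity fixed at construction, copying disabled, so an already-declared
// object can only be given a new capacity by destroying it and rebuilding it in place.
struct FixedQueue {
    long *buf;
    long cap, tail;
    explicit FixedQueue(long n = 0) : buf(new long[n]), cap(n), tail(0) {}
    ~FixedQueue() { delete[] buf; }
    FixedQueue(const FixedQueue &) = delete;
    FixedQueue &operator=(const FixedQueue &) = delete;
    void push_back(long x) { buf[tail++] = x; } // assumes tail < cap
};

void giveRealCapacity(FixedQueue &q, long realSize)
{
    q.~FixedQueue();               // end the lifetime of the placeholder object
    new (&q) FixedQueue(realSize); // construct the right-sized queue in place
}

Nothing may touch the object between the two statements, since its lifetime ends at the destructor call and only restarts with the placement new.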
new(&privateQLocalVtx) staticQueue(size); new(&privateQGhostVtx) staticQueue(size); new(&privateQMsgType) staticQueue(size); new(&privateQOwner) staticQueue(size); - /* - new(&privateReqQLocalVtx) staticQueue(size); - new(&privateReqQGhostVtx) staticQueue(size); - new(&privateReqQMsgType) staticQueue(size); - new(&privateReqQOwner) staticQueue(size); - */ #pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) - for ( v=0; v < NLVer; v++ ) - { - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - k = candidateMate[v]; - candidateMate[v] = verLocInd[k]; - w = candidateMate[v]; + for (v = 0; v < NLVer; v++) { + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - //If found a dominating edge: - if (w >= 0) { - - //This piece of code is actually executed under 0.01% of the times - - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } - if (w >= 0) { + if (w >= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) +#pragma omp critical + { + if (Counter[Ghost2LocalMap[w]] > 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + Counter[Ghost2LocalMap[w]] -= 1; //Decrement if (Counter[Ghost2LocalMap[w]] == 0) { S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ @@ -565,79 +564,86 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } // end of critical region - } //End of if a Ghost Vertex - else { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical(Mate) - { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - - //if (w < 0) { -- if it arrives here this one if is useless, it is certainly -1 - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + + + } //End of if a Ghost Vertex + else { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + privateU.push_back(v + StartIndex); + privateU.push_back(w); + + Mate[v] = w; //v is local + //FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; //w is local + + 
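privateU, and further down the privateQ* queues, implement the buffer-then-merge idea from the earlier TODO: each thread collects entries in a private queue and appends them to the shared structures in batches, rather than taking a lock on every push_back. A minimal sketch of the pattern, with std::vector standing in for staticQueue and placeholder names:

#include <omp.h>
#include <vector>

// Each thread fills a private vector; the shared queue U is locked once per batch
// instead of once per push_back.
void gatherMatchedVertices(const std::vector<long> &mate, std::vector<long> &U)
{
#pragma omp parallel
    {
        std::vector<long> privateU; // thread-local buffer
#pragma omp for nowait
        for (long v = 0; v < (long)mate.size(); v++)
            if (mate[v] >= 0)
                privateU.push_back(v); // no locking on the hot path
#pragma omp critical(U)
        U.insert(U.end(), privateU.begin(), privateU.end()); // one merge per thread
    }
}

The nowait clause lets a thread move on to its merge as soon as its share of the loop is done.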
+#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + + //This piece of code is executed a really small amount of times, I will not allocate a + //huge amount of memory to the private data structures. + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Date: Thu, 2 Jun 2022 09:15:31 -0500 Subject: [PATCH 23/96] Extendend parallel region after SEND PACKET BUNDLE Nothing parallelizable founded --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 176 ++++++++++-------- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 99 insertions(+), 79 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 9f3cbb97..2a541e9f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -213,7 +213,15 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt S; MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - + MilanLongInt myIndex = 0; + vector PCumulative, PMessageBundle, PSizeInfoMessages; + vector SRequest; //Requests that are used for each send message + vector SStatus; //Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex = 0; //Pointer for current message + MilanInt OneMessageSize = 0; + MilanLongInt numMessagesToSend; + MilanInt BufferSize; + MilanLongInt *Buffer; bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); @@ -868,7 +876,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( omp_get_thread_num(), myRank); #endif - } // end of parallel region ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// @@ -878,28 +885,34 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ///////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// - //Data structures for Bundled Messages: - vector PCumulative, PMessageBundle, PSizeInfoMessages; - MilanLongInt myIndex=0; - try { - PMessageBundle.reserve(NumMessagesBundled*3); //Three integers per message - PCumulative.reserve(numProcs+1); //Similar to Row Pointer vector in CSR data structure - PSizeInfoMessages.reserve(numProcs*3); //Buffer to hold the Size info message packets - } catch ( length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; - exit(1); - } - PMessageBundle.resize(NumMessagesBundled*3, -1);//Initialize - PCumulative.resize(numProcs+1, 0); //Only initialize the counter variable - PSizeInfoMessages.resize(numProcs*3, 0); - - for (MilanInt i=0; i SRequest; //Requests that are used for each send message - vector SStatus; //Status of sent messages, used in MPI_Wait - MilanLongInt MessageIndex=0; //Pointer for current message - try { - SRequest.reserve(numProcs*2); //At most two messages per processor - SStatus.reserve(numProcs*2);//At most two messages per processor - } catch ( 
length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; - exit(1); - } - MPI_Request myReq; //A sample request - SRequest.resize(numProcs*2,myReq); - MPI_Status myStat; //A sample status - SStatus.resize(numProcs*2,myStat); - //Send the Messages - for (MilanInt i=0; i 0 ) { //Send only if it is a nonempty packet - MPI_Isend(&PSizeInfoMessages[i*3+0], 3, TypeMap(), i, ComputeTag, comm, &SRequest[MessageIndex]); - msgActual++; - MessageIndex++; - //Now Send the message with the data packet: -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { //Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[MessageIndex]); + msgActual++; + MessageIndex++; + //Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), i, BundleTag, comm, &SRequest[MessageIndex]); - MessageIndex++; - } //End of if size > 0 - } - //Free up temporary memory: - PCumulative.clear(); - QLocalVtx.clear(); - QGhostVtx.clear(); - QMsgType.clear(); - QOwner.clear(); + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); + MessageIndex++; + } //End of if size > 0 + } + //Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); + + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Fri, 10 Jun 2022 15:34:29 -0500 Subject: [PATCH 25/96] PROCESS MATCHED VERTICES draft of parallelization --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 334 +++++++++--------- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 175 insertions(+), 161 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index dee2a019..d93337c9 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,6 +1,7 @@ #include "MatchBoxPC.h" #include #include + // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -602,8 +603,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if(w >=0) - //This piece of code is executed a really small amount of times, I will not allocate a - //huge amount of memory to the private data structures. 
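        // The loop below walks v's adjacency list and, for every ghost neighbour, queues a
        // FAILURE notification addressed to that neighbour's owner (in this version the
        // entries go onto the per-thread private queues and are merged into the shared
        // QLocalVtx/QGhostVtx/QMsgType/QOwner afterwards).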
adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; for (k1 = adj11; k1 < adj12; k1++) { @@ -622,16 +621,28 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( assert(ghostOwner != -1); assert(ghostOwner != myRank); PCounter[ghostOwner]++; - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); } //End of if(GHOST) } //End of for loop //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) { + + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); + + } + } + #pragma omp critical(U) { while (!privateU.empty()) @@ -658,202 +669,205 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt localVertices = 0; #endif - while( true ) - { + while (true) { #pragma omp critical(U) - { - if (U.empty()) isEmpty = true; - else u = U.pop_front(); - } // End of critical U - if (isEmpty) break; + { + if (U.empty()) isEmpty = true; + else u = U.pop_front(); + } // End of critical U + if (isEmpty) break; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices ++; #endif - //Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; //Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) { - v = verLocInd[k]; + //Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; //Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: #pragma omp critical { -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex)) { //If Local Vertex: - //If the current vertex is pointing to a matched vertex and is not matched - //FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. 
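        // (One likely reason the FIXME above is hard to resolve: candidateMate[v - StartIndex]
        //  can be read and rewritten by whichever thread happens to dequeue one of v's matched
        //  neighbours from U, so the array is genuinely shared state; a thread-private copy
        //  would need its own merge step.)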
- if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) { - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - if ((w < StartIndex) || (w > EndIndex)) { //A ghost -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - //Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; + + //If the current vertex is pointing to a matched vertex and is not matched + //FIXME is there a way to make candidateMate private? + // for the moment it could generate errors. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and + candidateMate[v - StartIndex] == u) { + + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; + //If found a dominating edge: + + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else { //Neighbor is a ghost vertex + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + GMate[Ghost2LocalMap[w]] = v; //w is a ghost vertex + //Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 -#pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + + } //End of if a Ghost Vertex + else { //w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + Mate[w - StartIndex] = v; //w is a local vertex + 
privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of Else + + } //End of if(w >=0) + else { + + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor is a ghost vertex + + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + + if (v != Mate[u - StartIndex]) { //u is local + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex #pragma omp critical(U) - { - while(!privateU.empty()) { - U.push_back(privateU.pop_front()); - } + { + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + } - myCard += privateMyCard; - } //End of critical U + myCard += privateMyCard; + } //End of critical U - } //End of while ( /*!Q.empty()*/ !U.empty() ) + } //End of while #pragma omp critical(privateMsg) { diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index bdacc992..1b59d29b 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0123 ! IDIM; domain size. Linear system size is IDIM**3 +00080 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From b2230a6d6d87e7098c59655f435ef2a6c4319751 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 13 Jun 2022 16:09:00 -0500 Subject: [PATCH 26/96] Improved critical region U --- ...istEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index dee2a019..966b86a2 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -841,9 +841,13 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of If( v != Mate[u] ) } //End of critical region } //End of Else //A Ghost Vertex + } //End of For Loop adj(u) + } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + //Avoid to ask for the critical section if there is nothing to add + if(privateU.empty()) continue; #pragma omp critical(U) { while(!privateU.empty()) { @@ -852,7 +856,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( myCard += privateMyCard; } //End of critical U - } //End of while ( /*!Q.empty()*/ !U.empty() ) #pragma omp critical(privateMsg) From bf35c1659b4f8aa7b9ebd635a6d39fe19f37654f Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 13 Jun 2022 16:53:12 -0500 Subject: [PATCH 27/96] Further improved critical region U --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 38 +++++++++++++++---- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 966b86a2..82ca4c44 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -72,6 +72,8 @@ Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) */ +#define UCHUNK 1000 + #ifdef SERIAL_MPI #else //MPI type map @@ -658,23 +660,41 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt localVertices = 0; #endif - while( true ) - { + //TODO what would be the optimal UCHUNK + vector Us; + Us.reserve(UCHUNK); + + while( true ) { + Us.clear(); #pragma omp critical(U) { - if (U.empty()) isEmpty = true; - else u = U.pop_front(); + //If U is emptu and there are no new node to add to U + if (U.empty() && privateU.empty()) + isEmpty = true; + else { + if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + myCard += privateMyCard; + } + for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes + if (U.empty()) break; + Us.push_back(U.pop_front()); + } + } } // End of critical U if (isEmpty) break; + for (MilanLongInt u : Us) + { #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices ++; #endif //Get the Adjacency list for u @@ -847,15 +867,17 @@ void 
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex //Avoid to ask for the critical section if there is nothing to add - if(privateU.empty()) continue; + if (privateU.size() < UCHUNK && !U.empty()) continue; #pragma omp critical(U) { - while(!privateU.empty()) { + while (!privateU.empty()) { U.push_back(privateU.pop_front()); } myCard += privateMyCard; } //End of critical U + + } } //End of while ( /*!Q.empty()*/ !U.empty() ) #pragma omp critical(privateMsg) diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index bdacc992..eb254780 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0123 ! IDIM; domain size. Linear system size is IDIM**3 +0080 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC From 6fd571ecb2d30a037c668019553e707664e8270e Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 14 Jun 2022 14:33:31 -0500 Subject: [PATCH 28/96] Lock error --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 48 +++++++++++++++---- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 82ca4c44..c9568a9f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -225,6 +225,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanInt BufferSize; MilanLongInt *Buffer; bool isEmpty; + + //Declare the locks + omp_lock_t MateLock[NLVer]; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif @@ -232,8 +235,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { - // TODO comments about the reduction + //Initialize the locks + //TODO this can be executed as task in parallel with other unparallelizable tasks + //TODO destroy the locks +#pragma omp for schedule(static) + for(int i = 0; i < NLVer; i++) + omp_init_lock(&MateLock[i]); + // TODO comments about the reduction #pragma omp for reduction(+ : numGhostEdges) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; @@ -704,7 +713,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: -#pragma omp critical +#pragma omp critical(innerProcessMatched) { #ifdef PRINT_DEBUG_INFO_ @@ -712,11 +721,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif + //If the current vertex is pointing to a matched vertex and is not matched //FIXME is there a way to make candidateMate private? // for the moment it could generate an error. 
if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and candidateMate[v - StartIndex] == u) { + + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], @@ -737,6 +749,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif //If found a dominating edge: if (w >= 0) { + + //TODO is it possible to lock without a critical region? + //TODO there must be a more elegant and efficient way to do this + while(true) { + if (omp_test_lock(&MateLock[v - StartIndex])) { + if (omp_test_lock(&MateLock[w - StartIndex])) break; + else omp_unset_lock(&MateLock[v - StartIndex]); + } + } + + if ((w < StartIndex) || (w > EndIndex)) { //A ghost #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<=0) else { adj11 = verLocPtr[v - StartIndex]; @@ -798,11 +825,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( for (k1 = adj11; k1 < adj12; k1++) { w = verLocInd[k1]; if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { //Neighbor is a ghost vertex -#pragma omp critical +#pragma omp critical(innerProcessMatched) { + + while(!omp_test_lock(&MateLock[u - StartIndex])); + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; if (v != Mate[u - StartIndex]) { //u is local @@ -859,6 +886,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; msgInd++; } //End of If( v != Mate[u] ) + + omp_unset_lock(&MateLock[u - StartIndex]); + } //End of critical region } //End of Else //A Ghost Vertex diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..904b6551 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +020 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From 2044c5c8eb6a0c307a8d316140aec951954394b3 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 14 Jun 2022 14:47:45 -0500 Subject: [PATCH 29/96] Merge fix, lock error --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 610 +++++++++--------- 1 file changed, 298 insertions(+), 312 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 6e58d724..f747f1fc 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,7 +1,6 @@ #include "MatchBoxPC.h" #include #include - // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -314,17 +313,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif - /* - * Not parallelizable - */ + /* + * Not parallelizable + */ - for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) - verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; + for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) + verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ - cout< 0) { + if (Counter[Ghost2LocalMap[w]] > 0) { Counter[Ghost2LocalMap[w]] -= 1; //Decrement if (Counter[Ghost2LocalMap[w]] == 0) { @@ -614,6 +613,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if(w >=0) + //This piece of code is executed a really small amount of times, I will not allocate a + //huge amount of memory to the private data structures. adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; for (k1 = adj11; k1 < adj12; k1++) { @@ -632,28 +633,16 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( assert(ghostOwner != -1); assert(ghostOwner != myRank); PCounter[ghostOwner]++; - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + QOwner.push_back(ghostOwner); } //End of if(GHOST) } //End of for loop //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) { - - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - - } - } - #pragma omp critical(U) { while (!privateU.empty()) @@ -668,261 +657,258 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Us; - Us.reserve(UCHUNK); + //TODO what would be the optimal UCHUNK + vector Us; + Us.reserve(UCHUNK); - while( true ) { + while( true ) { - Us.clear(); + Us.clear(); #pragma omp critical(U) - { - //If U is emptu and there are no new node to add to U - if (U.empty() && privateU.empty()) - isEmpty = true; - else { - if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U - while (!privateU.empty()) { - U.push_back(privateU.pop_front()); - myCard += privateMyCard; + { + //If U is emptu and there are no new node to add to U + if (U.empty() && privateU.empty()) + isEmpty = true; + else { + if (U.empty() && !privateU.empty()) // If U is empty but there are 
nodes in private U + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + myCard += privateMyCard; + } + for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes + if (U.empty()) break; + Us.push_back(U.pop_front()); } - for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes - if (U.empty()) break; - Us.push_back(U.pop_front()); } - } - } // End of critical U - if (isEmpty) break; + } // End of critical U + if (isEmpty) break; - for (MilanLongInt u : Us) - { + for (MilanLongInt u : Us) + { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices ++; #endif - //Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; //Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) { - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: -#pragma omp critical(innerProcessMatched) - { + //Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; //Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: +#pragma omp critical(innerProcessMatched) + { - - //If the current vertex is pointing to a matched vertex and is not matched - //FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) { - - - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - - //TODO is it possible to lock without a critical region? - //TODO there must be a more elegant and efficient way to do this - while(true) { - if (omp_test_lock(&MateLock[v - StartIndex])) { - if (omp_test_lock(&MateLock[w - StartIndex])) break; - else omp_unset_lock(&MateLock[v - StartIndex]); - } - } - if ((w < StartIndex) || (w > EndIndex)) { //A ghost -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - omp_unset_lock(&MateLock[v - StartIndex]); - omp_unset_lock(&MateLock[w - StartIndex]); + //TODO is it possible to lock without a critical region? 
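            // A common alternative to the test-and-back-off loop below (not what this code
            // does) is to acquire the two locks in a fixed global order, e.g. lowest local
            // index first; with a total order on acquisition no thread can hold one lock
            // while spinning on the other, so plain omp_set_lock calls suffice. Sketch only,
            // assuming v and w both map to valid local lock slots (not true when w is a ghost):
            //
            //     MilanLongInt lo = std::min(v, w) - StartIndex;
            //     MilanLongInt hi = std::max(v, w) - StartIndex;
            //     omp_set_lock(&MateLock[lo]);
            //     if (hi != lo) omp_set_lock(&MateLock[hi]);
            //     /* ... update Mate / GMate for v and w ... */
            //     if (hi != lo) omp_unset_lock(&MateLock[hi]);
            //     omp_unset_lock(&MateLock[lo]);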
+ //TODO there must be a more elegant and efficient way to do this + while(true) { + if (omp_test_lock(&MateLock[v - StartIndex])) { + if (omp_test_lock(&MateLock[w - StartIndex])) break; + else omp_unset_lock(&MateLock[v - StartIndex]); + } + } - } //End of if(w >=0) - else { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) { //A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + Mate[w - StartIndex] = v; //w is a local vertex + //Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), + ComputeTag, comm); */ + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + //ghostOwner = inputSubGraph.findOwner(w); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + } //End of if(GHOST) + } //End of for loop + } // End of Else: w == -1 + //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + + } //End of If (candidateMate[v-StartIndex] == u + + } //End of critical region if + + } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor is a ghost vertex #pragma omp critical(innerProcessMatched) - { + { - while(!omp_test_lock(&MateLock[u - StartIndex])); + while(!omp_test_lock(&MateLock[u - StartIndex])); - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) { //u is local + //Build the Message Packet: + //Message[0] = u; //LOCAL + //Message[1] = v; //GHOST + //Message[2] = SUCCESS; //TYPE + //Send a Request (Asynchronous) - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - //Avoid to ask for the 
critical section if there is nothing to add - if (privateU.size() < UCHUNK && !U.empty()) continue; + //Avoid to ask for the critical section if there is nothing to add + if (privateU.size() < UCHUNK && !U.empty()) continue; #pragma omp critical(U) - { - while (!privateU.empty()) { - U.push_back(privateU.pop_front()); - } + { + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + } - myCard += privateMyCard; - } //End of critical U + myCard += privateMyCard; + } //End of critical U - } - } //End of while ( /*!Q.empty()*/ !U.empty() ) + } + } //End of while ( /*!Q.empty()*/ !U.empty() ) #pragma omp critical(privateMsg) { @@ -938,20 +924,20 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", + printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, omp_get_thread_num(), myRank); #endif - ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< 0 ) { - Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory - if ( Buffer == 0 ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout<<"Not enough memory to allocate for send buffer on process "< 0 ) { + Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory + if ( Buffer == 0 ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout<<"Not enough memory to allocate for send buffer on process "< &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap - ) { + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap +) { bool result = false; #pragma omp critical(Mate) @@ -1776,15 +1762,15 @@ inline bool isAlreadyMatched(MilanLongInt node, * @return */ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal* edgeLocWeight, - MilanLongInt k, - MilanLongInt* verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector & GMate, - MilanLongInt* Mate, - map & Ghost2LocalMap) + MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector & GMate, + MilanLongInt* Mate, + map & Ghost2LocalMap) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN From bf0532867d4b5d7e891a45609d496d213fb3ce84 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 25 Jun 2022 08:48:49 -0500 Subject: [PATCH 30/96] Functions in different files --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 168 ++---------------- .../impl/aggregator/computeCandidateMate.cpp | 69 +++++++ amgprec/impl/aggregator/findOwnerOfGhost.cpp | 52 ++++++ amgprec/impl/aggregator/isAlreadyMatched.cpp | 42 +++++ samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 5 files changed, 177 insertions(+), 156 deletions(-) create mode 100644 amgprec/impl/aggregator/computeCandidateMate.cpp create mode 100644 amgprec/impl/aggregator/findOwnerOfGhost.cpp create mode 100644 amgprec/impl/aggregator/isAlreadyMatched.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index f747f1fc..ea0e460f 100644 --- 
a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,6 +1,10 @@ #include "MatchBoxPC.h" #include #include +#include "isAlreadyMatched.cpp" +#include "findOwnerOfGhost.cpp" +#include "computeCandidateMate.cpp" + // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -238,9 +242,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Initialize the locks //TODO this can be executed as task in parallel with other unparallelizable tasks //TODO destroy the locks -#pragma omp for schedule(static) - for(int i = 0; i < NLVer; i++) - omp_init_lock(&MateLock[i]); +//#pragma omp for schedule(static) +// for(int i = 0; i < NLVer; i++) +// omp_init_lock(&MateLock[i]); // TODO comments about the reduction #pragma omp for reduction(+ : numGhostEdges) @@ -752,12 +756,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //TODO is it possible to lock without a critical region? //TODO there must be a more elegant and efficient way to do this + /* while(true) { if (omp_test_lock(&MateLock[v - StartIndex])) { if (omp_test_lock(&MateLock[w - StartIndex])) break; else omp_unset_lock(&MateLock[v - StartIndex]); } } + */ if ((w < StartIndex) || (w > EndIndex)) { //A ghost @@ -815,8 +821,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if(CandidateMate(w) = v } //End of Else - omp_unset_lock(&MateLock[v - StartIndex]); - omp_unset_lock(&MateLock[w - StartIndex]); + //omp_unset_lock(&MateLock[v - StartIndex]); + //omp_unset_lock(&MateLock[w - StartIndex]); } //End of if(w >=0) else { @@ -859,7 +865,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #pragma omp critical(innerProcessMatched) { - while(!omp_test_lock(&MateLock[u - StartIndex])); + //while(!omp_test_lock(&MateLock[u - StartIndex])); if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; @@ -887,7 +893,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( msgInd++; } //End of If( v != Mate[u] ) - omp_unset_lock(&MateLock[u - StartIndex]); + //omp_unset_lock(&MateLock[u - StartIndex]); } //End of critical region } //End of Else //A Ghost Vertex @@ -1637,154 +1643,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //MPI_Barrier(comm); } //End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate - -///Find the owner of a ghost node: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs) { - //MilanLongInt Size = mVerDistance.size(); - MilanLongInt mStartInd = mVerDistance[myRank]; - MilanInt Start = 0; - MilanInt End = numProcs; - MilanInt Current = 0; - -#if 0 - if ( vtxIndex < mStartInd ) - End = myRank; - else - Start = myRank; -#endif - - while ( Start <= End ) { - Current = (End + Start)/2; - //CASE-1: - if ( mVerDistance[Current] == vtxIndex ) { - while ( mVerDistance[Current+1] == vtxIndex ) { - Current++; - if ( Current == numProcs ) - return (-1); - } - return (Current); - } - else { //CASE 2: - if ( mVerDistance[Current] > vtxIndex ) - End = Current - 1; - else //CASE 3: - Start = Current + 1; - } - } //End of While() - if ( Current == 0 ) - return (Current); - else { - if ( mVerDistance[Current] > vtxIndex ) - return (Current-1); - else - return (Current); - } //End of else - return (-1); 
//It should not reach here! -} //End of findOwnerOfGhost() - -/** - * Execute the research fr the Candidate Mate without controlling if the vertices are already matched. - * Returns the vertices with the highest weight - * @param adj1 - * @param adj2 - * @param verLocInd - * @param edgeLocWeight - * @return - */ -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt* verLocInd, - MilanReal* edgeLocWeight) -{ - MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - int finalK; - for (int k = adj1; k < adj2; k++) { - - if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - finalK = k; - } - } //End of for loop - return finalK; -} - -/** - * //TODO documentation - * @param k - * @param verLocInd - * @param StartIndex - * @param EndIndex - * @param GMate - * @param Mate - * @param Ghost2LocalMap - * @return - */ -inline bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap -) { - - bool result = false; -#pragma omp critical(Mate) - { - if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched - result = true; - } else { //A local vertex - if (Mate[node - StartIndex] >= 0) // Already matched - result = true; - } - - } - - return result; -} - -/** - * //TODO documentation - * @param adj1 - * @param adj2 - * @param edgeLocWeight - * @param k - * @param verLocInd - * @param StartIndex - * @param EndIndex - * @param GMate - * @param Mate - * @param Ghost2LocalMap - * @return - */ -inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal* edgeLocWeight, - MilanLongInt k, - MilanLongInt* verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector & GMate, - MilanLongInt* Mate, - map & Ghost2LocalMap) -{ - MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - for (k = adj1; k < adj2; k++) { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; - - if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - } - } //End of for loop - return w; -} #endif #endif \ No newline at end of file diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp new file mode 100644 index 00000000..92e3c92b --- /dev/null +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -0,0 +1,69 @@ +#include "MatchBoxPC.h" + +/** + * Execute the research fr the Candidate Mate without controlling if the vertices are already matched. 
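 * (Note: the value actually returned is finalK, the position of the heaviest admissible
 *  edge in verLocInd/edgeLocWeight, rather than the vertex id itself.)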
+ * Returns the vertices with the highest weight + * @param adj1 + * @param adj2 + * @param verLocInd + * @param edgeLocWeight + * @return + */ +inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt* verLocInd, + MilanReal* edgeLocWeight) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + int finalK; + for (int k = adj1; k < adj2; k++) { + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + finalK = k; + } + } //End of for loop + return finalK; +} + +/** + * //TODO documentation + * @param adj1 + * @param adj2 + * @param edgeLocWeight + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector & GMate, + MilanLongInt* Mate, + map & Ghost2LocalMap) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + return w; +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp new file mode 100644 index 00000000..10850607 --- /dev/null +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -0,0 +1,52 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" + +///Find the owner of a ghost node: +inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + MilanInt myRank, MilanInt numProcs) { + //MilanLongInt Size = mVerDistance.size(); + MilanLongInt mStartInd = mVerDistance[myRank]; + MilanInt Start = 0; + MilanInt End = numProcs; + MilanInt Current = 0; + +#if 0 + if ( vtxIndex < mStartInd ) + End = myRank; + else + Start = myRank; +#endif + + while ( Start <= End ) { + Current = (End + Start)/2; + //CASE-1: + if ( mVerDistance[Current] == vtxIndex ) { + while ( mVerDistance[Current+1] == vtxIndex ) { + Current++; + if ( Current == numProcs ) + return (-1); + } + return (Current); + } + else { //CASE 2: + if ( mVerDistance[Current] > vtxIndex ) + End = Current - 1; + else //CASE 3: + Start = Current + 1; + } + } //End of While() + if ( Current == 0 ) + return (Current); + else { + if ( mVerDistance[Current] > vtxIndex ) + return (Current-1); + else + return (Current); + } //End of else + return (-1); //It should not reach here! 
+} //End of findOwnerOfGhost() diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp new file mode 100644 index 00000000..5a9cf476 --- /dev/null +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -0,0 +1,42 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" + +/** + * //TODO documentation + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap +) { + + bool result = false; +#pragma omp critical(Mate) + { + if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched + result = true; + } else { //A local vertex + if (Mate[node - StartIndex] >= 0) // Already matched + result = true; + } + + } + + return result; +} \ No newline at end of file diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index 904b6551..b6c448c3 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -020 ! IDIM; domain size. Linear system size is IDIM**3 +0020 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC From a54f084ffb57b51735e2fb2412ab74bef7d87ac2 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 25 Jun 2022 10:16:30 -0500 Subject: [PATCH 31/96] refactoring, initialization --- amgprec/impl/aggregator/MatchBoxPC.h | 25 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 236 +++-------------- amgprec/impl/aggregator/initialize.cpp | 239 ++++++++++++++++++ 3 files changed, 305 insertions(+), 195 deletions(-) create mode 100644 amgprec/impl/aggregator/initialize.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index ba7cb5c8..9b0218bc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -59,7 +59,7 @@ #include #include #include -// #include "matchboxp.h" +#include "omp.h" #include "primitiveDataTypeDefinitions.h" #include "dataStrStaticQueue.h" @@ -175,6 +175,29 @@ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt* Mate, map &Ghost2LocalMap); +inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt* numGhostEdgesPtr, + MilanLongInt* numGhostVerticesPtr, + MilanLongInt* insertMePtr, + MilanLongInt* verLocInd, + MilanLongInt* verLocPtr, + omp_lock_t* MateLock, + map &Ghost2LocalMap, + vector & Counter, + vector & verGhostPtr, + vector & verGhostInd, + vector & tempCounter, + vector & GMate, + vector& Message, + vector& QLocalVtx, + vector& QGhostVtx, + vector& QMsgType, + vector& QOwner, + MilanLongInt* candidateMate, + staticQueue& U + ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp 
b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ea0e460f..dc3606c3 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -4,6 +4,7 @@ #include "isAlreadyMatched.cpp" #include "findOwnerOfGhost.cpp" #include "computeCandidateMate.cpp" +#include "initialize.cpp" // *********************************************************************** // @@ -146,10 +147,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanReal startTime, finishTime; //MilanReal Precision = MPI_Wtick(); //Get the precision of the MPI Timer startTime = MPI_Wtime(); - //Get the iterators for the graph: - //vector::iterator verLocPtr = inputSubGraph.getVerPtr_b(); - //vector::iterator verLocInd = inputSubGraph.getVerInd_b(); - //vector::iterator edgeLocWeight = inputSubGraph.getEdgeWt_b(); //Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] @@ -171,9 +168,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( PCounter[i] = 0; - MilanLongInt NumMessagesBundled; - MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - //vector candidateMate; + MilanLongInt NumMessagesBundled = 0; + MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! MilanLongInt* candidateMate = new MilanLongInt[1]; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex - // index that starts with zero to |Vg| - 1 - map::iterator storedAlready; vector Counter; //Store the edge count for each ghost vertex MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices @@ -200,17 +194,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( if (myRank == 0) cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; @@ -232,182 +215,37 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Declare the locks omp_lock_t MateLock[NLVer]; -#ifdef TIME_TRACKER - double Ghost2LocalInitialization = MPI_Wtime(); -#endif - -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) - { - - //Initialize the locks - //TODO this can be executed as task in parallel with other unparallelizable tasks - //TODO destroy the locks -//#pragma omp for schedule(static) -// for(int i = 0; i < NLVer; i++) -// omp_init_lock(&MateLock[i]); - - // TODO comments about the reduction -#pragma omp for reduction(+ : numGhostEdges) - for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice - insertMe = verLocInd[i]; - //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost - numGhostEdges++; -#pragma omp critical - { - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "<second<<" - "<first<<" : "<second]< 0 ) - cout< EndIndex)) { //Find a ghost -#pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - 
tempCounter[Ghost2LocalMap[w]]++; //Increment the counter - } - } //End of if((w < StartIndex) || (w > EndIndex)) - } //End of for(k) - } //End of for (v) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { #pragma omp single - { - -#ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); -#endif - -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt* numGhostEdgesPtr, + MilanLongInt* numGhostVerticesPtr, + MilanLongInt* insertMePtr, + MilanLongInt* verLocInd, + MilanLongInt* verLocPtr, + omp_lock_t* MateLock, + map &Ghost2LocalMap, + vector & Counter, + vector & verGhostPtr, + vector & verGhostInd, + vector & tempCounter, + vector & GMate, + vector& Message, + vector& QLocalVtx, + vector& QGhostVtx, + vector& QMsgType, + vector& QOwner, + MilanLongInt* candidateMate, + staticQueue& U + ) +{ + + MilanLongInt insertMe = 0, numGhostEdges = 0, numGhostVertices = 0; + MilanLongInt adj1, adj2; + int i, v, k, w; + + + // index that starts with zero to |Vg| - 1 + map::iterator storedAlready; + +#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { + + //Initialize the locks + //TODO this can be executed as task in parallel with other unparallelizable tasks + //TODO destroy the locks +#pragma omp for schedule(static) + for(i = 0; i < NLVer; i++) + omp_init_lock(&MateLock[i]); + + +#ifdef TIME_TRACKER + double Ghost2LocalInitialization = MPI_Wtime(); +#endif + + /* + * OMP Ghost2LocalInitialization + * The cycle analyzes all the edges and when finds a ghost edge + * puts it in the Ghost2LocalMap. + * A critical region is needed when inserting data in the map. + * + * Despite the critical region it is still productive to + * parallelize this for because the critical region is exeuted + * only when a ghost edge is found and ghost edges are a minority. 
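 *
 * The reduction(+ : numGhostEdges) clause on the loop below gives each thread its own
 * zero-initialised private copy of numGhostEdges and adds the copies back into the
 * shared variable when the loop finishes, so the ghost-edge count itself needs no
 * atomic update inside the loop.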
+ */ + + // TODO comments about the reduction +#pragma omp for reduction(+ : numGhostEdges) + for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice + insertMe = verLocInd[i]; + //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost + numGhostEdges++; +#pragma omp critical + { + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) { //Has already been added + //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]< 0 ) + cout< EndIndex)) { //Find a ghost +#pragma omp critical + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter + } + } //End of if((w < StartIndex) || (w > EndIndex)) + } //End of for(k) + } //End of for (v) + + } + + #pragma omp single + { + +#ifdef TIME_TRACKER + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Date: Sat, 25 Jun 2022 12:10:14 -0500 Subject: [PATCH 32/96] Refactoring Initialization --- amgprec/impl/aggregator/MatchBoxPC.h | 3 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 66 ++----------------- amgprec/impl/aggregator/initialize.cpp | 44 ++++++++++++- 3 files changed, 49 insertions(+), 64 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 9b0218bc..f1cb257a 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -180,6 +180,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt* numGhostEdgesPtr, MilanLongInt* numGhostVerticesPtr, MilanLongInt* insertMePtr, + MilanLongInt* S, MilanLongInt* verLocInd, MilanLongInt* verLocPtr, omp_lock_t* MateLock, @@ -194,7 +195,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, vector& QGhostVtx, vector& QMsgType, vector& QOwner, - MilanLongInt* candidateMate, + MilanLongInt* &candidateMate, staticQueue& U ); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index dc3606c3..174eb578 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -170,7 +170,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NumMessagesBundled = 0; MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! 
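   // (In this revision candidateMate is only declared here; the MatchBoxPC.h change above
   //  passes it to initialize() as MilanLongInt* &candidateMate, presumably so the array can
   //  be allocated inside initialize() once the number of ghost vertices is known.)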
- MilanLongInt* candidateMate = new MilanLongInt[1]; + MilanLongInt* candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<& QGhostVtx, vector& QMsgType, vector& QOwner, - MilanLongInt* candidateMate, + MilanLongInt* &candidateMate, staticQueue& U ) { @@ -233,7 +234,48 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } // end of single region +#ifdef PRINT_DEBUG_INFO_ +cout<<"\n("< Date: Sat, 25 Jun 2022 15:25:13 -0500 Subject: [PATCH 33/96] Refactoring private queues, still not working --- amgprec/impl/aggregator/MatchBoxPC.h | 8 ++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 33 +++++------ amgprec/impl/aggregator/dataStrStaticQueue.h | 6 +- amgprec/impl/aggregator/initialize.cpp | 57 ++++++++----------- 4 files changed, 48 insertions(+), 56 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index f1cb257a..54830919 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -179,7 +179,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt* numGhostEdgesPtr, MilanLongInt* numGhostVerticesPtr, - MilanLongInt* insertMePtr, MilanLongInt* S, MilanLongInt* verLocInd, MilanLongInt* verLocPtr, @@ -196,7 +195,12 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, vector& QMsgType, vector& QOwner, MilanLongInt* &candidateMate, - staticQueue& U + staticQueue& U, + staticQueue& privateU, + staticQueue& privateQLocalVtx, + staticQueue& privateQGhostVtx, + staticQueue& privateQMsgType, + staticQueue& privateQOwner ); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 174eb578..b4ead45d 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -185,7 +185,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Build the Ghost Vertex Set: Vg map Ghost2LocalMap; //Map each ghost vertex to a local vertex vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices + MilanLongInt numGhostVertices = 0, numGhostEdges = 0; //Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<& QMsgType, vector& QOwner, MilanLongInt* &candidateMate, - staticQueue& U + staticQueue& U, + staticQueue& privateU, + staticQueue& privateQLocalVtx, + staticQueue& privateQGhostVtx, + staticQueue& privateQMsgType, + staticQueue& privateQOwner ) { @@ -37,7 +41,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt adj1, adj2; int i, v, k, w; - // index that starts with zero to |Vg| - 1 map::iterator storedAlready; @@ -64,10 +67,9 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * * Despite the critical region it is still productive to * parallelize this for because the critical region is exeuted - * only when a ghost edge is found and ghost edges are a minority. + * only when a ghost edge is found and ghost edges are a minority, + * circa 3.5% during the tests. 
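The change from MilanLongInt* candidateMate to MilanLongInt* &candidateMate in the initialize() signature is what lets the allocation move inside initialize() while the caller still sees the new buffer. A tiny sketch of that out-parameter idiom, with hypothetical names (allocateBuffer is not a MatchBox function):

#include <cstdio>

// Out-parameter sketch: the reference-to-pointer lets the callee's new[]
// become visible to the caller.
void allocateBuffer(long n, long *&buf) {
    buf = new long[n];                  // the caller's pointer now owns this block
    for (long i = 0; i < n; i++) buf[i] = -1;
}

int main() {
    long *candidate = nullptr;
    allocateBuffer(8, candidate);       // with a plain 'long *buf' parameter,
                                        // candidate would still be nullptr here
    std::printf("candidate[0] = %ld\n", candidate[0]);
    delete[] candidate;
    return 0;
}

With a pointer passed by value the callee would only overwrite its local copy, so moving the new[] into initialize() requires either this reference form or an extra level of indirection.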
*/ - - // TODO comments about the reduction #pragma omp for reduction(+ : numGhostEdges) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; @@ -90,8 +92,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } //End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) } //End of for(ghost vertices) - - #pragma omp single { //numGhostEdges = atomicNumGhostEdges; @@ -143,7 +143,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, /* * Not parallelizable */ - for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ @@ -163,6 +162,10 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, fflush(stdout); #endif +#ifdef TIME_TRACKER + double verGhostIndInitialization = MPI_Wtime(); +#endif + /* * OMP verGhostIndInitialization * @@ -175,13 +178,8 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * Despite the critical region it's still useful to * parallelize the for cause the ghost nodes * are a minority hence the critical region is executed - * few times. + * few times, circa 3.5% of the times in the tests. */ - -#ifdef TIME_TRACKER - double verGhostIndInitialization = MPI_Wtime(); -#endif - #pragma omp for nowait schedule(static) for (v = 0; v < NLVer; v++) { adj1 = verLocPtr[v]; //Vertex Pointer @@ -192,17 +190,14 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp critical { insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency tempCounter[Ghost2LocalMap[w]]++; //Increment the counter } + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency } //End of if((w < StartIndex) || (w > EndIndex)) } //End of for(k) } //End of for (v) - - } - #pragma omp single - { + } // End of parallel region #ifdef TIME_TRACKER verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; @@ -216,11 +211,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, cout< Date: Sun, 26 Jun 2022 04:40:13 -0500 Subject: [PATCH 34/96] Initialize parallelized with task --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 30 ++++++++++---- amgprec/impl/aggregator/initialize.cpp | 41 ++++++++++++------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index b4ead45d..9d5b6417 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -326,9 +326,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( assert(ghostOwner != myRank); PCounter[ghostOwner]++; - //TODO why does it fail if I use a private data structure??? - /* + //TODO why does it fail if I use a private data structure??? 
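The ghostOwner computed a few lines above comes from findOwnerOfGhost(), which the header describes as a binary search over verDistance. A minimal stand-in with the same contract, assuming the usual distribution array of numProcs+1 entries where rank p owns the half-open range [dist[p], dist[p+1]) (ownerOfVertex is a hypothetical name, not the MatchBox routine):

#include <cstdio>

// Hypothetical stand-in for findOwnerOfGhost: dist has numProcs+1 entries and
// rank p owns vertices in [dist[p], dist[p+1]).
int ownerOfVertex(long vtx, const long *dist, int numProcs) {
    int lo = 0, hi = numProcs - 1;
    while (lo < hi) {
        int mid = (lo + hi) / 2;
        if (vtx < dist[mid + 1]) hi = mid;   // vtx falls at or before rank mid
        else                     lo = mid + 1;
    }
    return lo;
}

int main() {
    long dist[] = {0, 100, 250, 400};        // three ranks
    std::printf("%d %d %d\n",
                ownerOfVertex(5, dist, 3),    // 0
                ownerOfVertex(100, dist, 3),  // 1
                ownerOfVertex(399, dist, 3)); // 2
    return 0;
}

The asserts in the patch then simply state that a vertex classified as ghost cannot resolve to the calling rank itself.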
privateQLocalVtx.push_back(v + StartIndex); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); @@ -343,7 +342,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QMsgType.push_back(REQUEST); QOwner.push_back(ghostOwner); } // end of critical region - + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { @@ -430,6 +429,20 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) + + #pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) { + + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); + + } + + } + #pragma omp critical(U) { while (!privateU.empty()) @@ -699,16 +712,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } } //End of while ( /*!Q.empty()*/ !U.empty() ) -#pragma omp critical(privateMsg) + #pragma omp critical(privateMsg) { while (!privateQLocalVtx.empty()) { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); } + } diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 117057b5..908bd1d3 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -8,6 +8,8 @@ #include "dataStrStaticQueue.h" #include "omp.h" +#define NUM_THREAD 4 + inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt* numGhostEdgesPtr, @@ -44,17 +46,19 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, // index that starts with zero to |Vg| - 1 map::iterator storedAlready; -#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD) { + #pragma omp single + { + //Initialize the locks //TODO this can be executed as task in parallel with other unparallelizable tasks //TODO destroy the locks -#pragma omp for schedule(static) +#pragma omp taskloop num_tasks(NUM_THREAD) for(i = 0; i < NLVer; i++) omp_init_lock(&MateLock[i]); - - + #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif @@ -70,7 +74,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * only when a ghost edge is found and ghost edges are a minority, * circa 3.5% during the tests. 
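The privateQ* queues above implement a common OpenMP pattern: each thread buffers its work locally and empties the buffer into the shared queues inside one named critical section, in FIFO order (pop_front), so entries keep the order in which they were produced. A reduced sketch with std::vector standing in for staticQueue and simplified names:

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    std::vector<long> QLocalVtx;              // shared queue
    const long nWork = 10000;

#pragma omp parallel
    {
        std::vector<long> privateQ;           // per-thread buffer
#pragma omp for nowait
        for (long v = 0; v < nWork; v++)
            if (v % 7 == 0)                   // "found something to queue"
                privateQ.push_back(v);

#pragma omp critical(privateMsg)              // one flush per thread, order preserved
        QLocalVtx.insert(QLocalVtx.end(), privateQ.begin(), privateQ.end());
    }
    std::printf("queued %zu items\n", QLocalVtx.size());
    return 0;
}

Flushing once per thread (or once every UCHUNK items, as the main loop does for privateU) keeps the critical section rare, which is the same argument the patches make for the ghost-edge loops.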
*/ -#pragma omp for reduction(+ : numGhostEdges) +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ : numGhostEdges) depend ( out : numGhostEdges, Counter, Ghost2LocalMap ) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; //cout<<"InsertMe on Process "< EndIndex) ) } //End of for(ghost vertices) - #pragma omp single - { - //numGhostEdges = atomicNumGhostEdges; #ifdef TIME_TRACKER Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); @@ -114,6 +115,9 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } #endif + #pragma omp task depend ( out : verGhostPtr, tempCounter, verGhostInd, GMate) depend ( in : numGhostVertices) + { + //Initialize adjacency Lists for Ghost Vertices: try { verGhostPtr.reserve(numGhostVertices + 1); //Pointer Vector @@ -139,18 +143,17 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, double verGhostPtrInitialization = MPI_Wtime(); #endif + } // End of task - /* - * Not parallelizable - */ +#pragma omp task depent ( out : verGhostPtr ) depend ( in : Counter, numGhostVertices) + { for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ cout< Date: Sun, 26 Jun 2022 04:48:49 -0500 Subject: [PATCH 35/96] Reformat initialize, refactoring of initialize completed --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 + amgprec/impl/aggregator/initialize.cpp | 435 ++++++++++-------- 2 files changed, 236 insertions(+), 200 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 9d5b6417..d6c58852 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -214,6 +214,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( bool isEmpty; //Declare the locks + // TODO destroy the locks omp_lock_t MateLock[NLVer]; initialize(NLVer, NLEdge, StartIndex, diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 908bd1d3..21210c34 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -8,35 +8,34 @@ #include "dataStrStaticQueue.h" #include "omp.h" -#define NUM_THREAD 4 +#define NUM_THREAD 12 inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt* numGhostEdgesPtr, - MilanLongInt* numGhostVerticesPtr, - MilanLongInt* S, - MilanLongInt* verLocInd, - MilanLongInt* verLocPtr, - omp_lock_t* MateLock, - map &Ghost2LocalMap, - vector & Counter, - vector & verGhostPtr, - vector & verGhostInd, - vector & tempCounter, - vector & GMate, - vector& Message, - vector& QLocalVtx, - vector& QGhostVtx, - vector& QMsgType, - vector& QOwner, - MilanLongInt* &candidateMate, - staticQueue& U, - staticQueue& privateU, - staticQueue& privateQLocalVtx, - staticQueue& privateQGhostVtx, - staticQueue& privateQMsgType, - staticQueue& privateQOwner - ) + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + omp_lock_t *MateLock, + map &Ghost2LocalMap, + vector &Counter, 
+ vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) { MilanLongInt insertMe = 0, numGhostEdges = 0, numGhostVertices = 0; @@ -48,53 +47,55 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD) { - - #pragma omp single + +#pragma omp single { - //Initialize the locks - //TODO this can be executed as task in parallel with other unparallelizable tasks - //TODO destroy the locks + // Initialize the locks #pragma omp taskloop num_tasks(NUM_THREAD) - for(i = 0; i < NLVer; i++) - omp_init_lock(&MateLock[i]); - + for (i = 0; i < NLVer; i++) + omp_init_lock(&MateLock[i]); + #ifdef TIME_TRACKER - double Ghost2LocalInitialization = MPI_Wtime(); + double Ghost2LocalInitialization = MPI_Wtime(); #endif - /* - * OMP Ghost2LocalInitialization - * The cycle analyzes all the edges and when finds a ghost edge - * puts it in the Ghost2LocalMap. - * A critical region is needed when inserting data in the map. - * - * Despite the critical region it is still productive to - * parallelize this for because the critical region is exeuted - * only when a ghost edge is found and ghost edges are a minority, - * circa 3.5% during the tests. - */ -#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ : numGhostEdges) depend ( out : numGhostEdges, Counter, Ghost2LocalMap ) - for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice - insertMe = verLocInd[i]; - //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost - numGhostEdges++; + /* + * OMP Ghost2LocalInitialization + * This loop analyzes all the edges and when finds a ghost edge + * puts it in the Ghost2LocalMap. + * A critical region is needed when inserting data in the map. + * + * Despite the critical region it is still productive to + * parallelize this cycle because the critical region is exeuted + * only when a ghost edge is found and ghost edges are a minority, + * circa 3.5% during the tests. 
+ */ +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges) depend(out \ + : numGhostEdges, Counter, Ghost2LocalMap) + for (i = 0; i < NLEdge; i++) + { // O(m) - Each edge stored twice + insertMe = verLocInd[i]; + if ((insertMe < StartIndex) || (insertMe > EndIndex)) + { // Find a ghost + numGhostEdges++; #pragma omp critical - { - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "<second<<" - "<first<<" : "<second]<second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl; fflush(stdout); storedAlready++; - } while ( storedAlready != Ghost2LocalMap.end() ); + } while (storedAlready != Ghost2LocalMap.end()); } #endif - #pragma omp task depend ( out : verGhostPtr, tempCounter, verGhostInd, GMate) depend ( in : numGhostVertices) - { +#pragma omp task depend(out \ + : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ + : numGhostVertices) + { - //Initialize adjacency Lists for Ghost Vertices: - try { - verGhostPtr.reserve(numGhostVertices + 1); //Pointer Vector - tempCounter.reserve(numGhostVertices); //Pointer Vector - verGhostInd.reserve(numGhostEdges); //Index Vector - GMate.reserve(numGhostVertices); //Ghost Mate Vector - } catch (length_error) { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - //Initialize the Vectors: - verGhostPtr.resize(numGhostVertices + 1, 0); //Pointer Vector - tempCounter.resize(numGhostVertices, 0); //Temporary Counter - verGhostInd.resize(numGhostEdges, -1); //Index Vector - GMate.resize(numGhostVertices, -1); //Temporary Counter - verGhostPtr[0] = 0; //The first value + // Initialize adjacency Lists for Ghost Vertices: + try + { + verGhostPtr.reserve(numGhostVertices + 1); // Pointer Vector + tempCounter.reserve(numGhostVertices); // Pointer Vector + verGhostInd.reserve(numGhostEdges); // Index Vector + GMate.reserve(numGhostVertices); // Ghost Mate Vector + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + // Initialize the Vectors: + verGhostPtr.resize(numGhostVertices + 1, 0); // Pointer Vector + tempCounter.resize(numGhostVertices, 0); // Temporary Counter + verGhostInd.resize(numGhostEdges, -1); // Index Vector + GMate.resize(numGhostVertices, -1); // Temporary Counter + verGhostPtr[0] = 0; // The first value #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) - cout< 0) + cout << verGhostPtr[numGhostVertices] << "\n"; + fflush(stdout); #endif #ifdef TIME_TRACKER - double verGhostIndInitialization = MPI_Wtime(); + double verGhostIndInitialization = MPI_Wtime(); #endif - /* - * OMP verGhostIndInitialization - * - * In this cycle the verGhostInd is initialized - * with the datas related to ghost edges. - * The check to see if a node is a ghost node is - * executed in paralle and when a ghost node - * is found a critical region is started. - * - * Despite the critical region it's still useful to - * parallelize the for cause the ghost nodes - * are a minority hence the critical region is executed - * few times, circa 3.5% of the times in the tests. 
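The loop this comment describes relies on a slot-reservation idiom: only the read-and-increment of tempCounter has to be serialized, because once a thread owns a position in verGhostInd no other thread can touch it, so the store itself stays outside the critical section (insertMe is private to each thread). A self-contained sketch of the same idiom with simplified names (cursor plays the role of tempCounter, out the role of verGhostInd):

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    const int nGhost = 4, perGhost = 100;
    std::vector<int> cursor(nGhost, 0);             // per-ghost fill cursor
    std::vector<int> out(nGhost * perGhost, -1);    // adjacency slots to fill

#pragma omp parallel for
    for (int item = 0; item < nGhost * perGhost; item++) {
        int g = item % nGhost;                      // which ghost this item belongs to
        int slot;
#pragma omp critical
        slot = g * perGhost + cursor[g]++;          // reserve a unique position
        out[slot] = item;                           // fill it without further locking
    }
    std::printf("filled %zu slots\n", out.size());
    return 0;
}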
- */ -#pragma omp taskloop num_tasks(NUM_THREAD) depend ( in : insertMe, Ghost2LocalMap, tempCounter) depend ( out : verGhostInd) - for (v = 0; v < NLVer; v++) { - adj1 = verLocPtr[v]; //Vertex Pointer - adj2 = verLocPtr[v + 1]; - for (k = adj1; k < adj2; k++) { - w = verLocInd[k]; //Get the adjacent vertex - if ((w < StartIndex) || (w > EndIndex)) { //Find a ghost + /* + * OMP verGhostIndInitialization + * + * In this cycle the verGhostInd is initialized + * with the datas related to ghost edges. + * The check to see if a node is a ghost node is + * executed in paralle and when a ghost node + * is found a critical region is started. + * + * Despite the critical region it's still useful to + * parallelize the for cause the ghost nodes + * are a minority hence the critical region is executed + * few times, circa 3.5% of the times in the tests. + */ +#pragma omp taskloop num_tasks(NUM_THREAD) depend(in \ + : insertMe, Ghost2LocalMap, tempCounter) depend(out \ + : verGhostInd) + for (v = 0; v < NLVer; v++) + { + adj1 = verLocPtr[v]; // Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) + { + w = verLocInd[k]; // Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) + { // Find a ghost #pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - tempCounter[Ghost2LocalMap[w]]++; //Increment the counter - } - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - } //End of if((w < StartIndex) || (w > EndIndex)) - } //End of for(k) - } //End of for (v) + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert + tempCounter[Ghost2LocalMap[w]]++; // Increment the counter + } + verGhostInd[insertMe] = v + StartIndex; // Add the adjacency + } // End of if((w < StartIndex) || (w > EndIndex)) + } // End of for(k) + } // End of for (v) - } // End of parallel region + } // End of parallel region #ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Date: Sun, 26 Jun 2022 10:02:11 -0500 Subject: [PATCH 36/96] initialize fix --- amgprec/impl/aggregator/initialize.cpp | 240 +++++++++++++------------ exec.sh | 1 + 2 files changed, 125 insertions(+), 116 deletions(-) diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 21210c34..c5ae3f26 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -8,7 +8,7 @@ #include "dataStrStaticQueue.h" #include "omp.h" -#define NUM_THREAD 12 +#define NUM_THREAD 4 inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, @@ -50,7 +50,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp single { - // Initialize the locks #pragma omp taskloop num_tasks(NUM_THREAD) for (i = 0; i < NLVer; i++) @@ -71,32 +70,38 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * only when a ghost edge is found and ghost edges are a minority, * circa 3.5% during the tests. 
*/ -#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ - : numGhostEdges) depend(out \ - : numGhostEdges, Counter, Ghost2LocalMap) - for (i = 0; i < NLEdge; i++) - { // O(m) - Each edge stored twice - insertMe = verLocInd[i]; - if ((insertMe < StartIndex) || (insertMe > EndIndex)) - { // Find a ghost - numGhostEdges++; + +#pragma omp task depend(out \ + : numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, numGhostVertices) + { + +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges) + for (i = 0; i < NLEdge; i++) + { // O(m) - Each edge stored twice + insertMe = verLocInd[i]; + if ((insertMe < StartIndex) || (insertMe > EndIndex)) + { // Find a ghost + numGhostEdges++; #pragma omp critical - { - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) - { // Has already been added - Counter[storedAlready->second]++; // Increment the counter + { + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) + { // Has already been added + Counter[storedAlready->second]++; // Increment the counter + } + else + { // Insert an entry for the ghost: + Ghost2LocalMap[insertMe] = numGhostVertices; // Add a map entry + Counter.push_back(1); // Initialize the counter + numGhostVertices++; // Increment the number of ghost vertices + } // End of else() } - else - { // Insert an entry for the ghost: - Ghost2LocalMap[insertMe] = numGhostVertices; // Add a map entry - Counter.push_back(1); // Initialize the counter - numGhostVertices++; // Increment the number of ghost vertices - } // End of else() - } - } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) - } // End of for(ghost vertices) + } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) + } // End of for(ghost vertices) + } // end of task depend + // numGhostEdges = atomicNumGhostEdges; #ifdef TIME_TRACKER Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); @@ -121,7 +126,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ - : numGhostVertices) + : numGhostVertices, numGhostEdges) { // Initialize adjacency Lists for Ghost Vertices: @@ -151,7 +156,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } // End of task -#pragma omp task depent(out \ +#pragma omp task depend(out \ : verGhostPtr) depend(in \ : Counter, numGhostVertices) { @@ -198,123 +203,126 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * are a minority hence the critical region is executed * few times, circa 3.5% of the times in the tests. 
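The restructuring visible above, where the depend clauses moved from the taskloop onto an enclosing task and the taskloop is nested inside it, follows from the OpenMP rules: depend is a clause of the task (and target) constructs, not of taskloop, so ordering between initialization stages has to be expressed on wrapper tasks. A reduced sketch of that shape, with made-up stage contents:

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    const int n = 1000;
    std::vector<int> counts(n, 0);
    long total = 0;

#pragma omp parallel num_threads(4)
#pragma omp single
    {
#pragma omp task depend(out : counts)                      // stage 1: produce counts
        {
#pragma omp taskloop num_tasks(4)
            for (int i = 0; i < n; i++)
                counts[i] = i % 3;
        }

#pragma omp task depend(in : counts) depend(out : total)   // stage 2: consume them
        {
#pragma omp taskloop num_tasks(4) reduction(+ : total)     // taskloop reduction needs OpenMP 5.0
            for (int i = 0; i < n; i++)
                total += counts[i];
        }
    }                                                       // implicit barrier waits for both tasks
    std::printf("total = %ld\n", total);                    // prints 999
    return 0;
}

Dependences only order sibling tasks generated from the same task region, which is why the wrappers are all created from within the single region.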
*/ -#pragma omp taskloop num_tasks(NUM_THREAD) depend(in \ - : insertMe, Ghost2LocalMap, tempCounter) depend(out \ - : verGhostInd) - for (v = 0; v < NLVer; v++) +#pragma omp task depend(in \ + : insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \ + : verGhostInd) { - adj1 = verLocPtr[v]; // Vertex Pointer - adj2 = verLocPtr[v + 1]; - for (k = adj1; k < adj2; k++) +#pragma omp taskloop num_tasks(NUM_THREAD) + for (v = 0; v < NLVer; v++) { - w = verLocInd[k]; // Get the adjacent vertex - if ((w < StartIndex) || (w > EndIndex)) - { // Find a ghost + adj1 = verLocPtr[v]; // Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) + { + w = verLocInd[k]; // Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) + { // Find a ghost #pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert - tempCounter[Ghost2LocalMap[w]]++; // Increment the counter - } - verGhostInd[insertMe] = v + StartIndex; // Add the adjacency - } // End of if((w < StartIndex) || (w > EndIndex)) - } // End of for(k) - } // End of for (v) - - } // End of parallel region + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert + tempCounter[Ghost2LocalMap[w]]++; // Increment the counter + } + verGhostInd[insertMe] = v + StartIndex; // Add the adjacency + } // End of if((w < StartIndex) || (w > EndIndex)) + } // End of for(k) + } // End of for (v) + } // end of tasklopp #ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Ghost Vertex Index: "; - for (v = 0; v < numGhostEdges; v++) - cout << verGhostInd[v] << "\t"; - cout << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Ghost Vertex Index: "; + for (v = 0; v < numGhostEdges; v++) + cout << verGhostInd[v] << "\t"; + cout << endl; + fflush(stdout); #endif #pragma omp task depend(in \ : numGhostEdges) depend(out \ : QLocalVtx, QGhostVtx, QMsgType, QOwner) - { - try { - QLocalVtx.reserve(numGhostEdges); // Local Vertex - QGhostVtx.reserve(numGhostEdges); // Ghost Vertex - QMsgType.reserve(numGhostEdges); // Message Type (Request/Failure) - QOwner.reserve(numGhostEdges); // Owner of the ghost: COmpute once and use later - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - } + try + { + QLocalVtx.reserve(numGhostEdges); // Local Vertex + QGhostVtx.reserve(numGhostEdges); // Ghost Vertex + QMsgType.reserve(numGhostEdges); // Message Type (Request/Failure) + QOwner.reserve(numGhostEdges); // Owner of the ghost: COmpute once and use later + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + } // end of task #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Allocating CandidateMate.. "; - fflush(stdout); + cout << "\n(" << myRank << ")Allocating CandidateMate.. 
"; + fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; - fflush(stdout); - fflush(stdout); + cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; + fflush(stdout); + fflush(stdout); #endif + #ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; - fflush(stdout); + if (myRank == 0) + cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; + fflush(stdout); #endif #pragma omp task depend(in \ : numGhostEdges, numGhostVertices) depend(out \ : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) - { - - - //The values calculated in this function are sent back to the calling function - *numGhostEdgesPtr = numGhostEdges; - *numGhostVerticesPtr = numGhostVertices; - - // Allocate Data Structures: - /* - * candidateMate was a vector and has been replaced with an array - * there is no point in using the vector (or maybe there is (???)) - * so I replaced it with an array wich is slightly faster - */ - candidateMate = new MilanLongInt[NLVer + numGhostVertices]; - - *S = numGhostVertices; // Initialize S with number of Ghost Vertices + { - /* - * Create the Queue Data Structure for the Dominating Set - * - * I had to declare the staticuQueue U before the parallel region - * to have it in the correct scope. Since we can't change the dimension - * of a staticQueue I had to destroy the previous object and instantiate - * a new one of the correct size. - */ - new (&U) staticQueue(NLVer + numGhostVertices); - - // TODO how can I decide a more meaningfull size? - MilanLongInt size = numGhostVertices; - - // Initialize the privte data structure - new (&privateU) staticQueue(NLVer + numGhostVertices); // TODO how can I put a meaningfull size? - new (&privateQLocalVtx) staticQueue(size); - new (&privateQGhostVtx) staticQueue(size); - new (&privateQMsgType) staticQueue(size); - new (&privateQOwner) staticQueue(size); - } - } // End of single + // The values calculated in this function are sent back to the calling function + *numGhostEdgesPtr = numGhostEdges; + *numGhostVerticesPtr = numGhostVertices; + + // Allocate Data Structures: + /* + * candidateMate was a vector and has been replaced with an array + * there is no point in using the vector (or maybe there is (???)) + * so I replaced it with an array wich is slightly faster + */ + candidateMate = new MilanLongInt[NLVer + numGhostVertices]; + + *S = numGhostVertices; // Initialize S with number of Ghost Vertices + + /* + * Create the Queue Data Structure for the Dominating Set + * + * I had to declare the staticuQueue U before the parallel region + * to have it in the correct scope. Since we can't change the dimension + * of a staticQueue I had to destroy the previous object and instantiate + * a new one of the correct size. + */ + new (&U) staticQueue(NLVer + numGhostVertices); + + // TODO how can I decide a more meaningfull size? + MilanLongInt size = numGhostVertices; + + // Initialize the privte data structure + new (&privateU) staticQueue(NLVer + numGhostVertices); // TODO how can I put a meaningfull size? 
+ new (&privateQLocalVtx) staticQueue(size); + new (&privateQGhostVtx) staticQueue(size); + new (&privateQMsgType) staticQueue(size); + new (&privateQOwner) staticQueue(size); + } // end of task + + } // End of single region + } // End of parallel region } diff --git a/exec.sh b/exec.sh index 3bb7bd90..50edf4ad 100755 --- a/exec.sh +++ b/exec.sh @@ -1,3 +1,4 @@ +rm amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o make all cd samples/advanced/pdegen make amg_d_pde3d From 7cfe198d0f383c541c83d61b7e99e016699e086d Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 26 Jun 2022 10:45:06 -0500 Subject: [PATCH 37/96] Format --- amgprec/impl/aggregator/MatchBoxPC.h | 315 ++-- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1562 +++++++++-------- 2 files changed, 1024 insertions(+), 853 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 54830919..530933e5 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -52,7 +52,7 @@ #ifndef _matchboxpC_H_ #define _matchboxpC_H_ -//Turn on a lot of debugging information with this switch: +// Turn on a lot of debugging information with this switch: //#define PRINT_DEBUG_INFO_ #include #include @@ -66,193 +66,190 @@ using namespace std; #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif #if !defined(SERIAL_MPI) - -#define MilanMpiLongInt MPI_LONG_LONG + +#define MilanMpiLongInt MPI_LONG_LONG #ifndef _primitiveDataType_Definition_ #define _primitiveDataType_Definition_ - //Regular integer: - #ifndef INTEGER_H - #define INTEGER_H - typedef int32_t MilanInt; - #endif +// Regular integer: +#ifndef INTEGER_H +#define INTEGER_H + typedef int32_t MilanInt; +#endif - //Regular long integer: - #ifndef LONG_INT_H - #define LONG_INT_H - #ifdef BIT64 - typedef int64_t MilanLongInt; - typedef MPI_LONG MilanMpiLongInt; - #else - typedef int32_t MilanLongInt; - typedef MPI_INT MilanMpiLongInt; - #endif - #endif +// Regular long integer: +#ifndef LONG_INT_H +#define LONG_INT_H +#ifdef BIT64 + typedef int64_t MilanLongInt; + typedef MPI_LONG MilanMpiLongInt; +#else + typedef int32_t MilanLongInt; + typedef MPI_INT MilanMpiLongInt; +#endif +#endif - //Regular boolean - #ifndef BOOL_H - #define BOOL_H - typedef bool MilanBool; - #endif +// Regular boolean +#ifndef BOOL_H +#define BOOL_H + typedef bool MilanBool; +#endif - //Regular double and absolute value computation: - #ifndef REAL_H - #define REAL_H - typedef double MilanReal; - typedef MPI_DOUBLE MilanMpiReal; - inline MilanReal MilanAbs(MilanReal value) - { - return fabs(value); - } - #endif +// Regular double and absolute value computation: +#ifndef REAL_H +#define REAL_H + typedef double MilanReal; + typedef MPI_DOUBLE MilanMpiReal; + inline MilanReal MilanAbs(MilanReal value) + { + return fabs(value); + } +#endif - //Regular float and absolute value computation: - #ifndef FLOAT_H - #define FLOAT_H - typedef float MilanFloat; - typedef MPI_FLOAT MilanMpiFloat; - inline MilanFloat MilanAbsFloat(MilanFloat value) - { - return fabs(value); - } - #endif +// Regular float and absolute value computation: +#ifndef FLOAT_H +#define FLOAT_H + typedef float MilanFloat; + typedef MPI_FLOAT MilanMpiFloat; + inline MilanFloat MilanAbsFloat(MilanFloat value) + { + return fabs(value); + } +#endif - //// Define the limits: - #ifndef LIMITS_H - #define LIMITS_H - //Integer Maximum and Minimum: - // #define MilanIntMax INT_MAX - // #define MilanIntMin INT_MIN - #define MilanIntMax INT32_MAX - 
#define MilanIntMin INT32_MIN +//// Define the limits: +#ifndef LIMITS_H +#define LIMITS_H + // Integer Maximum and Minimum: + // #define MilanIntMax INT_MAX + // #define MilanIntMin INT_MIN +#define MilanIntMax INT32_MAX +#define MilanIntMin INT32_MIN - #ifdef BIT64 - #define MilanLongIntMax INT64_MAX - #define MilanLongIntMin -INT64_MAX - #else - #define MilanLongIntMax INT32_MAX - #define MilanLongIntMin -INT32_MAX - #endif +#ifdef BIT64 +#define MilanLongIntMax INT64_MAX +#define MilanLongIntMin -INT64_MAX +#else +#define MilanLongIntMax INT32_MAX +#define MilanLongIntMin -INT32_MAX +#endif - #endif +#endif // +INFINITY const double PLUS_INFINITY = numeric_limits::infinity(); const double MINUS_INFINITY = -PLUS_INFINITY; - //#define MilanRealMax LDBL_MAX - #define MilanRealMax PLUS_INFINITY - #define MilanRealMin MINUS_INFINITY +//#define MilanRealMax LDBL_MAX +#define MilanRealMax PLUS_INFINITY +#define MilanRealMin MINUS_INFINITY #endif -//Function of find the owner of a ghost vertex using binary search: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + // Function of find the owner of a ghost vertex using binary search: + inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs); -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt* verLocInd, - MilanReal* edgeLocWeight); + inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight); -inline bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap); + inline bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); -inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal* edgeLocWeight, - MilanLongInt k, - MilanLongInt* verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap); + inline MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); -inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt* numGhostEdgesPtr, - MilanLongInt* numGhostVerticesPtr, - MilanLongInt* S, - MilanLongInt* verLocInd, - MilanLongInt* verLocPtr, - omp_lock_t* MateLock, - map &Ghost2LocalMap, - vector & Counter, - vector & verGhostPtr, - vector & verGhostInd, - vector & tempCounter, - vector & GMate, - vector& Message, - vector& QLocalVtx, - vector& QGhostVtx, - vector& QMsgType, - vector& QOwner, - MilanLongInt* &candidateMate, - staticQueue& U, - staticQueue& privateU, - staticQueue& privateQLocalVtx, - staticQueue& privateQGhostVtx, - staticQueue& privateQMsgType, - staticQueue& privateQOwner - ); + inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + omp_lock_t *MateLock, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector 
&verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); -void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP - ( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC -( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); - void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC -( -MilanLongInt NLVer, MilanLongInt NLEdge, -MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, -MilanLongInt* verDistance, -MilanLongInt* Mate, -MilanInt myRank, MilanInt numProcs, MPI_Comm comm, -MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, -MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, -MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); -void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MilanInt icomm, - MilanLongInt* 
msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MilanInt icomm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); -void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MilanInt icomm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MilanInt icomm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); #endif #ifdef __cplusplus diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d6c58852..d470b1ab 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -81,26 +81,32 @@ #ifdef SERIAL_MPI #else -//MPI type map -template MPI_Datatype TypeMap(); -template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template<> inline MPI_Datatype TypeMap() { return MPI_INT; } -template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } +// MPI type map +template +MPI_Datatype TypeMap(); +template <> +inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template <> +inline MPI_Datatype TypeMap() { return MPI_INT; } +template <> +inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template <> +inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION -//WARNING: The vertex block on a given rank is contiguous +// WARNING: The vertex block on a given rank is contiguous void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, - MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, - MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, + MilanReal *msgPercent, 
+ MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card) +{ /* * verDistance: it's a vector long as the number of processors. @@ -118,152 +124,159 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #if !defined(SERIAL_MPI) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Message; // [ u, v, message_type ] - Message.resize(3,-1); - const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; + Message.resize(3, -1); + const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; const MilanLongInt SIZEINFO = 4; MilanLongInt message_type = 0; - //Data structures for Message Bundling: - //Although up to two messages can be sent along any cross edge, - //only one message will be sent in the initialization phase - - //one of: REQUEST/FAILURE/SUCCESS + // Data structures for Message Bundling: + // Although up to two messages can be sent along any cross edge, + // only one message will be sent in the initialization phase - + // one of: REQUEST/FAILURE/SUCCESS vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - MilanLongInt* PCounter = new MilanLongInt [numProcs]; + MilanLongInt *PCounter = new MilanLongInt[numProcs]; for (int i = 0; i < numProcs; i++) PCounter[i] = 0; - MilanLongInt NumMessagesBundled = 0; MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! - MilanLongInt* candidateMate = nullptr; + MilanLongInt *candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex - vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0; //Number of Ghost vertices + // Build the Ghost Vertex Set: Vg + map Ghost2LocalMap; // Map each ghost vertex to a local vertex + vector Counter; // Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; - //Mate array for ghost vertices: - vector GMate; //Proportional to the number of ghost vertices + // Define Adjacency Lists for Ghost Vertices: + // cout<<"Building Ghost data structures ... 
\n\n"; + vector verGhostPtr, verGhostInd, tempCounter; + // Mate array for ghost vertices: + vector GMate; // Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; MilanLongInt myIndex = 0; - vector PCumulative, PMessageBundle, PSizeInfoMessages; - vector SRequest; //Requests that are used for each send message - vector SStatus; //Status of sent messages, used in MPI_Wait - MilanLongInt MessageIndex = 0; //Pointer for current message + vector PCumulative, PMessageBundle, PSizeInfoMessages; + vector SRequest; // Requests that are used for each send message + vector SStatus; // Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex = 0; // Pointer for current message MilanInt OneMessageSize = 0; MilanLongInt numMessagesToSend; MilanInt BufferSize; MilanLongInt *Buffer; bool isEmpty; - //Declare the locks - // TODO destroy the locks + // Declare the locks + // TODO destroy the locks omp_lock_t MateLock[NLVer]; - initialize(NLVer, NLEdge, StartIndex, - EndIndex, &numGhostEdges, - &numGhostVertices, &S, - verLocInd, verLocPtr, - MateLock, - Ghost2LocalMap, Counter, - verGhostPtr, verGhostInd, - tempCounter, GMate, - Message, QLocalVtx, - QGhostVtx, QMsgType, QOwner, - candidateMate, U, - privateU, - privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner - ); - + initialize(NLVer, NLEdge, StartIndex, + EndIndex, &numGhostEdges, + &numGhostVertices, &S, + verLocInd, verLocPtr, + MateLock, + Ghost2LocalMap, Counter, + verGhostPtr, verGhostInd, + tempCounter, GMate, + Message, QLocalVtx, + QGhostVtx, QMsgType, QOwner, + candidateMate, U, + privateU, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + finishTime = MPI_Wtime(); - *ph0_time = finishTime - startTime; //Time taken for Phase-0: Initialization + *ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization - startTime = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// INITIALIZATION ///////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// - //Compute the Initial Matching Set: + // Compute the Initial Matching Set: #pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { /* - * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from - * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize - * the two. - * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. - */ + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
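The loop that follows is race free because each iteration writes only its own candidateMate[v] and reads the CSR arrays without modifying them, so the worksharing for needs no synchronization at all. A reduced, self-contained sketch of the same step (the real firstComputeCandidateMate also skips ineligible or already matched neighbours, which is omitted here):

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    // Tiny CSR graph: 3 vertices, arbitrary weights.
    std::vector<long>   ptr = {0, 2, 4, 6};
    std::vector<long>   ind = {1, 2, 0, 2, 0, 1};
    std::vector<double> wt  = {0.5, 2.0, 0.5, 1.0, 2.0, 1.0};
    std::vector<long>   candidateMate(3, -1);

#pragma omp parallel for schedule(static)
    for (long v = 0; v < 3; v++) {
        double best = -1.0;
        for (long k = ptr[v]; k < ptr[v + 1]; k++)
            if (wt[k] > best) { best = wt[k]; candidateMate[v] = ind[k]; }  // heaviest neighbour
    }
    for (long v = 0; v < 3; v++)
        std::printf("candidateMate[%ld] = %ld\n", v, candidateMate[v]);
    return 0;
}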
+ */ #pragma omp for schedule(static) - for ( v=0; v < NLVer; v++ ) { + for (v = 0; v < NLVer; v++) + { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { + // If found a dominating edge: + if (w >= 0) + { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { w = computeCandidateMate(verLocPtr[v], verLocPtr[v + 1], edgeLocWeight, 0, @@ -310,13 +328,15 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( candidateMate[v] = w; } - if (w >= 0) { + if (w >= 0) + { myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + if (Counter[Ghost2LocalMap[w]] > 0) + { - Counter[Ghost2LocalMap[w]] -= 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + Counter[Ghost2LocalMap[w]] -= 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } // End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { // w is a local vertex + } // End of if a Ghost Vertex + else + { // w is a local vertex - if (candidateMate[w - StartIndex] == (v + StartIndex)) { + if (candidateMate[w - StartIndex] == (v + StartIndex)) + { privateU.push_back(v + StartIndex); privateU.push_back(w); - Mate[v] = w; //v is local - //FIXME this instruction could create errors - Mate[w - StartIndex] = v + StartIndex; //w is local - + Mate[v] = w; // v is local + // FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; // w is local #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) + } // End of if(w >=0) - //This piece of code is executed a really small amount of times, I will not allocate a - //huge amount of memory to the private data structures. + // This piece of code is executed a really small amount of times, I will not allocate a + // huge amount of memory to the private data structures. 
adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Us; + // TODO what would be the optimal UCHUNK + vector Us; Us.reserve(UCHUNK); - while( true ) { + while (true) + { Us.clear(); #pragma omp critical(U) { - //If U is emptu and there are no new node to add to U + // If U is emptu and there are no new node to add to U if (U.empty() && privateU.empty()) isEmpty = true; - else { + else + { if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U - while (!privateU.empty()) { + while (!privateU.empty()) + { U.push_back(privateU.pop_front()); myCard += privateMyCard; } - for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes - if (U.empty()) break; + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; Us.push_back(U.pop_front()); } } } // End of critical U - if (isEmpty) break; + if (isEmpty) + break; for (MilanLongInt u : Us) { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices++; #endif - //Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; //Pointer + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) { + for (k = adj1; k < adj2; k++) + { v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: #pragma omp critical(innerProcessMatched) { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { + // If found a dominating edge: + if (w >= 0) + { - //TODO is it possible to lock without a critical region? - //TODO there must be a more elegant and efficient way to do this + // TODO is it possible to lock without a critical region? 
+ // TODO there must be a more elegant and efficient way to do this /* while(true) { if (omp_test_lock(&MateLock[v - StartIndex])) { @@ -562,11 +598,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } */ - - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - //Q.push_back(u); + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + // Q.push_back(u); privateU.push_back(v); privateU.push_back(w); privateMyCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { + } // End of if(w >=0) + else + { adj11 = verLocPtr[v - StartIndex]; adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else { //Neighbor is a ghost vertex + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex #pragma omp critical(innerProcessMatched) { - //while(!omp_test_lock(&MateLock[u - StartIndex])); + // while(!omp_test_lock(&MateLock[u - StartIndex])); if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) + if (v != Mate[u - StartIndex]) + { // u is local + // Build the Message Packet: + // Message[0] = u; //LOCAL + // Message[1] = v; //GHOST + // Message[2] = SUCCESS; //TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - //Avoid to ask for the critical section if there is nothing to add - if (privateU.size() < UCHUNK && !U.empty()) continue; + // Avoid to ask for the critical section if there is nothing to add + if (privateU.size() < UCHUNK && !U.empty()) + continue; #pragma omp critical(U) { - while (!privateU.empty()) { + while (!privateU.empty()) + { U.push_back(privateU.pop_front()); } myCard += privateMyCard; - } //End of critical U - + } // End of critical U } - } //End of while ( /*!Q.empty()*/ !U.empty() ) + } // End of while ( 
/*!Q.empty()*/ !U.empty() ) - #pragma omp critical(privateMsg) +#pragma omp critical(privateMsg) { - while (!privateQLocalVtx.empty()) { + while (!privateQLocalVtx.empty()) + { QLocalVtx.push_back(privateQLocalVtx.pop_front()); QGhostVtx.push_back(privateQGhostVtx.pop_front()); QMsgType.push_back(privateQMsgType.pop_front()); QOwner.push_back(privateQOwner.pop_front()); - } - } - #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); + localVertices, + omp_get_thread_num(), + myRank); #endif - ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< 0) { //Send only if it is a nonempty packet + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, &SRequest[MessageIndex]); msgActual++; MessageIndex++; - //Now Send the message with the data packet: + // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), i, BundleTag, comm, &SRequest[MessageIndex]); MessageIndex++; - } //End of if size > 0 + } // End of if size > 0 } - //Free up temporary memory: + // Free up temporary memory: PCumulative.clear(); QLocalVtx.clear(); QGhostVtx.clear(); QMsgType.clear(); QOwner.clear(); - #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), comm, &OneMessageSize); //Size of one message packet - //How many messages to send? - //Potentially three kinds of messages will be sent/received: - //Request, Success, Failure. - //But only two will be sent from a given processor. - //Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = numGhostEdges*2 - NumMessagesBundled; - BufferSize = (OneMessageSize+MPI_BSEND_OVERHEAD)*numMessagesToSend; - - Buffer=0; -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; + BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; + + Buffer = 0; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << BufferSize; + cout << "\n(" << myRank << ")Attaching Buffer on.. 
"; + fflush(stdout); #endif - if ( BufferSize > 0 ) { - Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory - if ( Buffer == 0 ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout<<"Not enough memory to allocate for send buffer on process "< 0) + { + Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; exit(1); } - MPI_Buffer_attach(Buffer, BufferSize); //Attach the Buffer + MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer } - } //End of master + } // End of master } // end of parallel region ///////////////////////// END OF SEND BUNDLED MESSAGES ////////////////////////////////// finishTime = MPI_Wtime(); - *ph1_time = finishTime-startTime; //Time taken for Phase-1 - *ph1_card = myCard; //Cardinality at the end of Phase-1 + *ph1_time = finishTime - startTime; // Time taken for Phase-1 + *ph1_card = myCard; // Cardinality at the end of Phase-1 startTime = MPI_Wtime(); ///////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////// MAIN LOOP ////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// - //Main While Loop: + // Main While Loop: #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< ReceiveBuffer; - MilanLongInt bundleSize=0, bundleCounter=0; - try { - ReceiveBuffer.reserve(numGhostEdges*2*3); //Three integers per cross edge - } catch ( length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; + MilanLongInt bundleSize = 0, bundleCounter = 0; + try + { + ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; exit(1); } - while ( true ) { + while (true) + { #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only If a Local Vertex - //Get the Adjacency list for u - adj1 = verLocPtr[u-StartIndex]; //Pointer - adj2 = verLocPtr[u-StartIndex+1]; - for( k = adj1; k < adj2; k++ ) { + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only If a Local Vertex + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { v = verLocInd[k]; - if ( (v >= StartIndex) && (v <= EndIndex) ) { //v is a Local Vertex: - if ( Mate[v-StartIndex] >= 0 ) // v is already matched + if ((v >= StartIndex) && (v <= EndIndex)) + { // v is a Local Vertex: + if (Mate[v - StartIndex] >= 0) // v is already matched continue; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k1 = adj11; k1 < adj12; k1++) + { + if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) + { // Is it a ghost vertex? 
+ if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched continue; } - else { //A local vertex - if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + else + { // A local vertex + if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched continue; } - if( (edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + if ((edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) + { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } - } //End of for loop - candidateMate[v-StartIndex] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) + { + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { - Mate[v-StartIndex] = w; //v is local - GMate[Ghost2LocalMap[w]] = v; //w is ghost - //Q.push_back(u); + msgInd++; + msgActual++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { - S--; //Decrement S + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if ( candidateMate[w-StartIndex] == v ) { - Mate[v-StartIndex] = w; //v is local - Mate[w-StartIndex] = v; //w is local - //Q.push_back(u); + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { //no dominating edge found: w == -1 - adj11 = verLocPtr[v-StartIndex]; - adj12 = verLocPtr[v-StartIndex+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { + } // End of if(CandidateMate(w) = v + } // End of Else + } // 
End of if(w >=0) + else + { // no dominating edge found: w == -1 + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - } //End of if(GHOST) - } //End of for loop - } // End of Else: w == -1 - //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } //End of If (candidateMate[v-StartIndex] == u) - } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else { //Neighbor v is a ghost vertex - if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) - candidateMate[NLVer+Ghost2LocalMap[v]] = -1; - if ( v != Mate[u-StartIndex] ) { //u is a local vertex - //Build the Message Packet: - Message[0] = u; //LOCAL - Message[1] = v; //GHOST - Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor v is a ghost vertex + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + { // u is a local vertex + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs); fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); msgInd++; msgActual++; #ifdef DEBUG_GHOST_ - if ((uEndIndex)) { - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - } //End of while ( /*!Q.empty()*/ !U.empty() ) + if ((u < StartIndex) || (u > EndIndex)) + { + cout << "\n(" << myRank << ") " << __LINE__ << " From Send: should not happen: u= " << u << " v= " << v << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } +#endif + + } // End of If( v != Mate[u] ) + } // End of Else //A Ghost Vertex + } // End of For Loop adj(u) + } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } // End of while ( /*!Q.empty()*/ !U.empty() ) ///////////////////////// END OF PROCESS MATCHED VERTICES 
///////////////////////// //// BREAK IF NO MESSAGES EXPECTED ///////// #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS ) { + if (error_codeC != MPI_SUCCESS) + { MPI_Error_string(error_codeC, error_message, &message_length); - cout<<"\n*Error in call to MPI_Receive on Slave: "<(), Sender, BundleTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS ) { + if (error_codeC != MPI_SUCCESS) + { MPI_Error_string(error_codeC, error_message, &message_length); - cout<<"\n*Error in call to MPI_Receive on processor "< EndIndex)) + { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } #endif #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<NLVer)) { - cout<<"\n("< NLVer)) + { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } #endif - if ( Mate[v-StartIndex] == -1 ) { //Process only if not already matched (v is local) - candidateMate[NLVer+Ghost2LocalMap[u]] = v; //Set CandidateMate for the ghost - if ( candidateMate[v-StartIndex] == u ) { - GMate[Ghost2LocalMap[u]] = v; //u is ghost - Mate[v-StartIndex] = u; //v is local - //Q.push_back(u); + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) + { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + // Q.push_back(u); U.push_back(v); U.push_back(u); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[u]] == 0 ) { - S--; //Decrement S + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + if (Counter[Ghost2LocalMap[u]] > 0) + { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement + if (Counter[Ghost2LocalMap[u]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) - } //End of if ( candidateMate[v-StartIndex] == u )e - } //End of if ( Mate[v] == -1 ) - } //End of REQUEST - else { //CASE II: SUCCESS - if ( message_type == SUCCESS ) { -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[u]] == 0 ) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else + { // CASE II: SUCCESS + if (message_type == SUCCESS) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); +#endif + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) + // process it again + if (Counter[Ghost2LocalMap[u]] > 0) + { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement + if (Counter[Ghost2LocalMap[u]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: 
PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) #ifdef DEBUG_GHOST_ - if ((v<0)||(vNLVer)) { - cout<<"\n("< NLVer)) + { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } +#endif + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; w = -1; - heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - for( k1 = adj11; k1 < adj12; k1++ ) { - if ( (verLocInd[k1]EndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k1 = adj11; k1 < adj12; k1++) + { + if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) + { // Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched continue; } - else { //A local vertex - if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + else + { // A local vertex + if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched continue; } - if( (edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + if ((edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) + { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } - } //End of for loop - candidateMate[v-StartIndex] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) + { + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { - Mate[v-StartIndex] = w; //v is local - GMate[Ghost2LocalMap[w]] = v; //w is ghost - //Q.push_back(u); + msgInd++; + msgActual++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { - S--; //Decrement S + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + 
if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if ( candidateMate[w-StartIndex] == v ) { - Mate[v-StartIndex] = w; //v is local - Mate[w-StartIndex] = v; //w is local - //Q.push_back(u); + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { //No dominant edge found - adj11 = verLocPtr[v-StartIndex]; - adj12 = verLocPtr[v-StartIndex+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - //MPI_Bsend(&Message[0], 3, MilanMpiLongInt, findOwnerOfGhost(w, verDistance, myRank, numProcs), - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + // MPI_Bsend(&Message[0], 3, MilanMpiLongInt, findOwnerOfGhost(w, verDistance, myRank, numProcs), + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - } //End of if(GHOST) - } //End of for loop - } // End of Else: w == -1 - //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } //End of if ( candidateMate[v-StartIndex] == u ) - } //End of if ( Mate[v] == -1 ) - } //End of if ( message_type == SUCCESS ) - else { //CASE III: FAILURE -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[u]] == 0 ) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) + { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement + if (Counter[Ghost2LocalMap[u]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) - } //End of else: CASE III - } //End of else: CASE I - } //End of if (!MsgQ.empty()) + } // End of if Counter[w] > 0 + // End: 
PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } // End of else: CASE III + } // End of else: CASE I + } // End of if (!MsgQ.empty()) ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer - free(Buffer); //Free the memory that was allocated + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated } finishTime = MPI_Wtime(); - *ph2_time = finishTime-startTime; //Time taken for Phase-2 - *ph2_card = myCard ; //Cardinality at the end of Phase-2 + *ph2_time = finishTime - startTime; // Time taken for Phase-2 + *ph2_card = myCard; // Cardinality at the end of Phase-2 #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; - } else { + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { *msgPercent = 0; } #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< Date: Thu, 30 Jun 2022 16:48:18 -0500 Subject: [PATCH 38/96] refactoring parallelComputeCandidateMateB --- amgprec/impl/aggregator/MatchBoxPC.h | 8 ++++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 45 +++++++++---------- .../parallelComputeCandidateMateB.cpp | 36 +++++++++++++++ 3 files changed, 65 insertions(+), 24 deletions(-) create mode 100644 amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 530933e5..88e205ba 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -64,6 +64,7 @@ #include "dataStrStaticQueue.h" using namespace std; +#define NUM_THREAD 4 #ifdef __cplusplus extern "C" @@ -203,6 +204,13 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); + inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d470b1ab..980824aa 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -5,6 +5,7 @@ #include "findOwnerOfGhost.cpp" #include "computeCandidateMate.cpp" #include "initialize.cpp" +#include "parallelComputeCandidateMateB.cpp" // *********************************************************************** // @@ -258,26 +259,22 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ///////////////////////////////////////////////////////////////////////////////////////// // Compute the Initial Matching Set: + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
+ */ + + PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer, + verLocPtr, + verLocInd, + myRank, + edgeLocWeight, + candidateMate); + #pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { - /* - * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from - * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize - * the two. - * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. - */ - -#pragma omp for schedule(static) - for (v = 0; v < NLVer; v++) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; - fflush(stdout); -#endif - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - } /* * PARALLEL_PROCESS_EXPOSED_VERTEX_B @@ -476,7 +473,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } } -#pragma omp master +#pragma omp single { tempCounter.clear(); // Do not need this any more } @@ -715,11 +712,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( candidateMate[NLVer + Ghost2LocalMap[v]] = -1; if (v != Mate[u - StartIndex]) { // u is local - // Build the Message Packet: - // Message[0] = u; //LOCAL - // Message[1] = v; //GHOST - // Message[2] = SUCCESS; //TYPE - // Send a Request (Asynchronous) + // Build the Message Packet: + // Message[0] = u; //LOCAL + // Message[1] = v; //GHOST + // Message[2] = SUCCESS; //TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Sending a success message: "; diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp new file mode 100644 index 00000000..ced93456 --- /dev/null +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -0,0 +1,36 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate) +{ + + MilanLongInt v = -1; + +#pragma omp parallel private(v) default(shared) num_threads(4) + { + +#pragma omp for schedule(static) + for (v = 0; v < NLVer; v++) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; + fflush(stdout); +#endif + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + } + } +} From b66de7f25cf15ca1b69d3a7150a5d15c797c34e4 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 6 Jul 2022 12:58:00 -0500 Subject: [PATCH 39/96] Refactoring PARALLEL_PROCESS_EXPOSED_VERTEX_B --- amgprec/impl/aggregator/MatchBoxPC.h | 58 +++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 237 +++------------- .../impl/aggregator/processExposedVertex.cpp | 256 ++++++++++++++++++ 3 files changed, 334 insertions(+), 217 deletions(-) create mode 100644 amgprec/impl/aggregator/processExposedVertex.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h 
b/amgprec/impl/aggregator/MatchBoxPC.h index 88e205ba..122a1380 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -205,21 +205,51 @@ extern "C" staticQueue &privateQOwner); inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, - MilanLongInt *verLocPtr, - MilanLongInt *verLocInd, - MilanInt myRank, - MilanReal *edgeLocWeight, - MilanLongInt *candidateMate); + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, - MilanLongInt *verDistance, - MilanLongInt *Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, - MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, - MilanLongInt *ph1_card, MilanLongInt *ph2_card); + inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 980824aa..930a4d37 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -6,6 +6,7 @@ #include "computeCandidateMate.cpp" #include "initialize.cpp" #include "parallelComputeCandidateMateB.cpp" +#include "processExposedVertex.cpp" // *********************************************************************** // @@ -273,211 +274,41 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); + PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, + candidateMate, + verLocInd, + verLocPtr, + StartIndex, + EndIndex, + Mate, + GMate, + Ghost2LocalMap, + edgeLocWeight, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + U, + privateU, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + 
privateQGhostVtx, + privateQMsgType, + privateQOwner); + + tempCounter.clear(); // Do not need this any more + #pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { - /* - * PARALLEL_PROCESS_EXPOSED_VERTEX_B - * The sequential version could be a bit more - * efficient. - * - * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner - * first in a local variable and then, only at the end, append them to the real data structure - * to remove the critical sections. - * - * TODO: Test when it's more efficient to execute this code - * in parallel. - */ - -#pragma omp for reduction(+ \ - : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) - for (v = 0; v < NLVer; v++) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - k = candidateMate[v]; - candidateMate[v] = verLocInd[k]; - w = candidateMate[v]; - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; - fflush(stdout); -#endif - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } - - if (w >= 0) - { - - myCard++; - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost vertex -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message (291):"; - cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); -#endif - - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - /* - //TODO why does it fail if I use a private data structure??? - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - */ - -#pragma omp critical(MSG) - { - - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - QOwner.push_back(ghostOwner); - } // end of critical region - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) - { - - privateU.push_back(v + StartIndex); - privateU.push_back(w); - Mate[v] = w; - // FIXME could this instruction create errors? 
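/* --------------------------------------------------------------------------
 * Editor's note, not part of the patch: the hunks above constantly translate
 * between global vertex ids and local array slots (Mate, GMate, Counter,
 * candidateMate). A small sketch of that convention, assuming the owned
 * global range is [StartIndex, EndIndex] and that ghosts were numbered
 * consecutively when Ghost2LocalMap was built; the wrapper struct and
 * helper names are illustrative, not part of MatchBoxPC.
 * ------------------------------------------------------------------------ */
#include <map>

struct VertexIndexing
{
    long StartIndex, EndIndex, NLVer;    // owned global range and its size
    std::map<long, long> Ghost2LocalMap; // global ghost id -> ghost slot

    bool isLocal(long g) const { return g >= StartIndex && g <= EndIndex; }

    // Slot of an owned vertex in Mate / verLocPtr / candidateMate.
    long localSlot(long g) const { return g - StartIndex; }

    // Slot of a ghost vertex in GMate / Counter.
    long ghostSlot(long g) const { return Ghost2LocalMap.at(g); }

    // candidateMate stores ghost entries after the NLVer owned entries.
    long candidateMateSlot(long g) const
    {
        return isLocal(g) ? g - StartIndex : NLVer + Ghost2LocalMap.at(g);
    }
};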
- GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; - fflush(stdout); -#endif - // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) -#pragma omp critical - { - if (Counter[Ghost2LocalMap[w]] > 0) - { - - Counter[Ghost2LocalMap[w]] -= 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } // End of if CandidateMate[w] = v - - } // End of if a Ghost Vertex - else - { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) - { - privateU.push_back(v + StartIndex); - privateU.push_back(w); - - Mate[v] = w; // v is local - // FIXME this instruction could create errors - Mate[w - StartIndex] = v + StartIndex; // w is local - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; - fflush(stdout); -#endif - - } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) - } // End of Else - - continue; - } // End of second if - - } // End of if(w >=0) - - // This piece of code is executed a really small amount of times, I will not allocate a - // huge amount of memory to the private data structures. - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of for ( v=0; v < NLVer; v++ ) - -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); - } - } - -#pragma omp critical(U) - { - while (!privateU.empty()) - { - U.push_back(privateU.pop_front()); - } - } - -#pragma omp single - { - tempCounter.clear(); // Do not need this any more - } - #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -788,7 +619,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ///////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// -#pragma omp barrier +#pragma omp barrier // TODO check if necessary #pragma omp master { // Data structures for Bundled Messages: diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp 
b/amgprec/impl/aggregator/processExposedVertex.cpp new file mode 100644 index 00000000..a76d3df8 --- /dev/null +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -0,0 +1,256 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +/* + * PARALLEL_PROCESS_EXPOSED_VERTEX_B + * The sequential version could be a bit more + * efficient. + * + * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner + * first in a local variable and then, only at the end, append them to the real data structure + * to remove the critical sections. + * + * TODO: Test when it's more efficient to execute this code + * in parallel. + */ + +inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) +{ + + const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; + const MilanLongInt SIZEINFO = 4; + MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0, S = *SPtr; + MilanLongInt myCard = 0, msgInd = 0; + MilanLongInt NumMessagesBundled = 0; + MilanInt ghostOwner = 0; + +#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + { +#pragma omp for reduction(+ \ + : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) + for (v = 0; v < NLVer; v++) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; + fflush(stdout); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) + { + + myCard++; + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost vertex +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message (291):"; + cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + + msgInd++; + NumMessagesBundled++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + /* + //TODO 
why does it fail if I use a private data structure??? + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + */ + +#pragma omp critical(MSG) + { + + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); + QOwner.push_back(ghostOwner); + } // end of critical region + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) + { + + privateU.push_back(v + StartIndex); + privateU.push_back(w); + Mate[v] = w; + // FIXME could this instruction create errors? + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; + fflush(stdout); +#endif + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) +#pragma omp critical + { + if (Counter[Ghost2LocalMap[w]] > 0) + { + + Counter[Ghost2LocalMap[w]] -= 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; + fflush(stdout); +#endif + } + } + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else + { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) + { + privateU.push_back(v + StartIndex); + privateU.push_back(w); + + Mate[v] = w; // v is local + // FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; // w is local + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); +#endif + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + + continue; + } // End of second if + + } // End of if(w >=0) + + // This piece of code is executed a really small amount of times, I will not allocate a + // huge amount of memory for the private data structures. 
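/* --------------------------------------------------------------------------
 * Editor's note, not part of the patch: a compact sketch of the
 * PARALLEL_PROCESS_CROSS_EDGE_B step used in the critical region above.
 * Each ghost vertex starts with a counter of unresolved cross edges; every
 * time one of those edges is settled the counter is decremented, and when
 * it hits zero the count S of ghosts still expecting messages is reduced,
 * which is what eventually lets the outer loops stop waiting. The struct
 * wrapper and names here are assumptions of the sketch.
 * ------------------------------------------------------------------------ */
#include <vector>

struct CrossEdgeState
{
    std::vector<long> Counter; // remaining unresolved cross edges per ghost
    long S;                    // ghosts that still expect at least one message
};

static void processCrossEdge(CrossEdgeState &st, long ghostSlot)
{
#pragma omp critical
    {
        if (st.Counter[ghostSlot] > 0)
        {
            st.Counter[ghostSlot] -= 1; // one more incident cross edge resolved
            if (st.Counter[ghostSlot] == 0)
                st.S -= 1; // this ghost needs no further messages
        }
    }
}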
+ adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + + msgInd++; + NumMessagesBundled++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + QOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of for ( v=0; v < NLVer; v++ ) + +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_front()); + } + +#pragma omp master + { + *myCardPtr = myCard; + *msgIndPtr = msgInd; + *NumMessagesBundledPtr = NumMessagesBundled; + *SPtr = S; + } + +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); + } + } + + } // End of parallel region +} \ No newline at end of file From 63b7602d3abd1adb2bbd41eacd80d994b2ea2ea9 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 6 Jul 2022 13:12:31 -0500 Subject: [PATCH 40/96] refactoring queueTransfer --- amgprec/impl/aggregator/MatchBoxPC.h | 11 ++++++ .../impl/aggregator/processExposedVertex.cpp | 23 ++++------- amgprec/impl/aggregator/queueTransfer.cpp | 38 +++++++++++++++++++ 3 files changed, 56 insertions(+), 16 deletions(-) create mode 100644 amgprec/impl/aggregator/queueTransfer.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 122a1380..96630f9c 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -241,6 +241,17 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); + inline void queuesTransfer(staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index a76d3df8..148951a5 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -7,6 +7,7 @@ #include "primitiveDataTypeDefinitions.h" #include "dataStrStaticQueue.h" #include "omp.h" +#include "queueTransfer.cpp" /* * PARALLEL_PROCESS_EXPOSED_VERTEX_B @@ -227,11 +228,12 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } // End of for ( v=0; v < NLVer; v++ ) -#pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_front()); - } + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #pragma omp 
master { @@ -241,16 +243,5 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, *SPtr = S; } -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); - } - } - } // End of parallel region } \ No newline at end of file diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp new file mode 100644 index 00000000..55b0983a --- /dev/null +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -0,0 +1,38 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void queuesTransfer(staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) +{ + +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_front()); + } + +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); + } + } +} \ No newline at end of file From 6dcae6d0c175da7246d5775b3f83820334ed4588 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 6 Jul 2022 15:33:29 -0500 Subject: [PATCH 41/96] fix private queues in PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 92 +++++++++---------- .../impl/aggregator/processExposedVertex.cpp | 33 ++----- amgprec/impl/aggregator/queueTransfer.cpp | 10 +- 3 files changed, 60 insertions(+), 75 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 930a4d37..1fb1d90f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -274,35 +274,35 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); - PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, - candidateMate, - verLocInd, - verLocPtr, - StartIndex, - EndIndex, - Mate, - GMate, - Ghost2LocalMap, - edgeLocWeight, - &myCard, - &msgInd, - &NumMessagesBundled, - &S, - verDistance, - PCounter, - Counter, - myRank, - numProcs, - U, - privateU, - QLocalVtx, - QGhostVtx, - QMsgType, - QOwner, - privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, + candidateMate, + verLocInd, + verLocPtr, + StartIndex, + EndIndex, + Mate, + GMate, + Ghost2LocalMap, + edgeLocWeight, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + U, + privateU, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); tempCounter.clear(); // Do not need this any more @@ -455,6 +455,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; 
fflush(stdout); #endif + + // TODO refactor this // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) if (Counter[Ghost2LocalMap[w]] > 0) @@ -579,29 +581,25 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( // Avoid to ask for the critical section if there is nothing to add if (privateU.size() < UCHUNK && !U.empty()) continue; -#pragma omp critical(U) - { - while (!privateU.empty()) - { - U.push_back(privateU.pop_front()); - } - - myCard += privateMyCard; - } // End of critical U + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); } } // End of while ( /*!Q.empty()*/ !U.empty() ) -#pragma omp critical(privateMsg) +#pragma omp critical { - while (!privateQLocalVtx.empty()) - { - - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); - } + myCard += privateMyCard; } + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 148951a5..86d19eeb 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -11,14 +11,9 @@ /* * PARALLEL_PROCESS_EXPOSED_VERTEX_B - * The sequential version could be a bit more - * efficient. + * TODO: write comment * - * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner - * first in a local variable and then, only at the end, append them to the real data structure - * to remove the critical sections. - * - * TODO: Test when it's more efficient to execute this code + * TODO: Test when it's actually more efficient to execute this code * in parallel. */ @@ -119,22 +114,11 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, assert(ghostOwner != myRank); PCounter[ghostOwner]++; - /* - //TODO why does it fail if I use a private data structure??? privateQLocalVtx.push_back(v + StartIndex); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); privateQOwner.push_back(ghostOwner); - */ - -#pragma omp critical(MSG) - { - - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - QOwner.push_back(ghostOwner); - } // end of critical region + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { @@ -149,6 +133,8 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; fflush(stdout); #endif + + //TODO refactor this!! 
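/* --------------------------------------------------------------------------
 * Editor's note, not part of the patch: a minimal sketch of the pattern the
 * queuesTransfer refactoring above introduces. Each thread appends to its
 * own private queues with no synchronisation and only merges them into the
 * shared queues inside short named critical sections, instead of taking a
 * lock per push. The containers are simplified to std::vector here; the
 * real code uses staticQueue and four parallel message queues
 * (QLocalVtx, QGhostVtx, QMsgType, QOwner).
 * ------------------------------------------------------------------------ */
#include <vector>

static void queuesTransferSketch(std::vector<long> &U,
                                 std::vector<long> &privateU,
                                 std::vector<long> &QMsg,
                                 std::vector<long> &privateQMsg)
{
#pragma omp critical(U)
    {
        // Merge this thread's newly matched vertices into the shared work list.
        U.insert(U.end(), privateU.begin(), privateU.end());
    }
    privateU.clear();

#pragma omp critical(privateMsg)
    {
        // Merge this thread's pending message descriptors in one shot.
        QMsg.insert(QMsg.end(), privateQMsg.begin(), privateQMsg.end());
    }
    privateQMsg.clear();
}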
// Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) #pragma omp critical @@ -218,10 +204,11 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, assert(ghostOwner != -1); assert(ghostOwner != myRank); PCounter[ghostOwner]++; - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); } // End of if(GHOST) } // End of for loop diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index 55b0983a..becf14cf 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -22,17 +22,17 @@ inline void queuesTransfer(staticQueue &U, #pragma omp critical(U) { while (!privateU.empty()) - U.push_back(privateU.pop_front()); + U.push_back(privateU.pop_back()); } #pragma omp critical(privateMsg) { while (!privateQLocalVtx.empty()) { - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); } } } \ No newline at end of file From 9b13aef1cedcb9c2bc0bc7d81167a1979d1a73a0 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Fri, 8 Jul 2022 13:32:24 -0500 Subject: [PATCH 42/96] processMathedVertices refactoring --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 354 +++--------------- amgprec/impl/aggregator/extractUChunk.cpp | 34 ++ .../impl/aggregator/processExposedVertex.cpp | 9 +- .../aggregator/processMatchedVertices.cpp | 337 +++++++++++++++++ 4 files changed, 425 insertions(+), 309 deletions(-) create mode 100644 amgprec/impl/aggregator/extractUChunk.cpp create mode 100644 amgprec/impl/aggregator/processMatchedVertices.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 1fb1d90f..b6ac6364 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -7,6 +7,8 @@ #include "initialize.cpp" #include "parallelComputeCandidateMateB.cpp" #include "processExposedVertex.cpp" +#include "processMatchedVertices.cpp" +//#include "extractUChunk.cpp" // *********************************************************************** // @@ -227,7 +229,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt numMessagesToSend; MilanInt BufferSize; MilanLongInt *Buffer; - bool isEmpty; // Declare the locks // TODO destroy the locks @@ -274,6 +275,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); + /* + * PARALLEL_PROCESS_EXPOSED_VERTEX_B + * TODO: write comment + * + * TODO: Test when it's actually more efficient to execute this code + * in parallel. 
+ */ + PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, candidateMate, verLocInd, @@ -306,309 +315,52 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( tempCounter.clear(); // Do not need this any more -#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// +//#define debug +#ifndef debug + + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + processMatchedVertices(NLVer, + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + +#endif + +#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); -#endif - /////////////////////////////////////////////////////////////////////////////////// - /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// - /////////////////////////////////////////////////////////////////////////////////// - isEmpty = false; - -#ifdef COUNT_LOCAL_VERTEX - MilanLongInt localVertices = 0; -#endif - - // TODO what would be the optimal UCHUNK - vector Us; - Us.reserve(UCHUNK); - - while (true) - { - - Us.clear(); -#pragma omp critical(U) - { - // If U is emptu and there are no new node to add to U - if (U.empty() && privateU.empty()) - isEmpty = true; - else - { - if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U - while (!privateU.empty()) - { - U.push_back(privateU.pop_front()); - myCard += privateMyCard; - } - for (int i = 0; i < UCHUNK; i++) - { // Pop the new nodes - if (U.empty()) - break; - Us.push_back(U.pop_front()); - } - } - } // End of critical U - if (isEmpty) - break; - - for (MilanLongInt u : Us) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); -#endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices - -#ifdef COUNT_LOCAL_VERTEX - localVertices++; -#endif - - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: -#pragma omp critical(innerProcessMatched) - { - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); -#endif - - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make 
candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) - { - - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - - // TODO is it possible to lock without a critical region? - // TODO there must be a more elegant and efficient way to do this - /* - while(true) { - if (omp_test_lock(&MateLock[v - StartIndex])) { - if (omp_test_lock(&MateLock[w - StartIndex])) break; - else omp_unset_lock(&MateLock[v - StartIndex]); - } - } - */ - - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); -#endif - - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - - // TODO refactor this - // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - } // End of if(CandidateMate(w) = v - } // End of Else - - // omp_unset_lock(&MateLock[v - StartIndex]); - // omp_unset_lock(&MateLock[w - StartIndex]); - - } // End of if(w >=0) - else - { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, 
verDistance, myRank, numProcs); - fflush(stdout); -#endif - /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), - ComputeTag, comm); */ - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - // ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - - } // End of If (candidateMate[v-StartIndex] == u - - } // End of critical region if - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex -#pragma omp critical(innerProcessMatched) - { - - // while(!omp_test_lock(&MateLock[u - StartIndex])); - - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is local - // Build the Message Packet: - // Message[0] = u; //LOCAL - // Message[1] = v; //GHOST - // Message[2] = SUCCESS; //TYPE - // Send a Request (Asynchronous) - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); -#endif - - QLocalVtx.push_back(u); - QGhostVtx.push_back(v); - QMsgType.push_back(SUCCESS); - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; - } // End of If( v != Mate[u] ) - - // omp_unset_lock(&MateLock[u - StartIndex]); - - } // End of critical region - } // End of Else //A Ghost Vertex - - } // End of For Loop adj(u) - - } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - - // Avoid to ask for the critical section if there is nothing to add - if (privateU.size() < UCHUNK && !U.empty()) - continue; - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - } - } // End of while ( /*!Q.empty()*/ !U.empty() ) - -#pragma omp critical - { - myCard += privateMyCard; - } - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - -#ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); -#endif - - ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ if (myRank == 0) cout << "\n(" << myRank << ") Send Bundles" << endl; diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp new file mode 100644 index 00000000..e8a6951c --- /dev/null +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -0,0 +1,34 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +#define UCHUNK 1000 + +inline void extractUChunk( + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU) +{ + + UChunkBeingProcessed.clear(); +#pragma omp critical(U) + { + + if (U.empty() && 
!privateU.empty()) // If U is empty but there are nodes in private U + while (!privateU.empty()) + U.push_back(privateU.pop_front()); + + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; + UChunkBeingProcessed.push_back(U.pop_front()); + } + + } // End of critical U +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 86d19eeb..3847110a 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -9,14 +9,6 @@ #include "omp.h" #include "queueTransfer.cpp" -/* - * PARALLEL_PROCESS_EXPOSED_VERTEX_B - * TODO: write comment - * - * TODO: Test when it's actually more efficient to execute this code - * in parallel. - */ - inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt *candidateMate, MilanLongInt *verLocInd, @@ -48,6 +40,7 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, staticQueue &privateQOwner) { + //TODO define all the constants in a single place! const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; const MilanLongInt FAILURE = 3; diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp new file mode 100644 index 00000000..5f54ef79 --- /dev/null +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -0,0 +1,337 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" +#include "extractUChunk.cpp" + +#define UCHUNK 1000 + +inline void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) +{ + + // TODO define all the constants in a single place! 
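// Illustrative sketch, not part of this patch: one way to satisfy this TODO is
// to hoist the message-type constants into the shared header (a later patch in
// this series does exactly this in MatchBoxPC.h):
//
//   // MatchBoxPC.h
//   const MilanLongInt REQUEST  = 1;
//   const MilanLongInt SUCCESS  = 2;
//   const MilanLongInt FAILURE  = 3;
//   const MilanLongInt SIZEINFO = 4;
//
// after which the per-function copies defined just below can be removed.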
+ const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; + const MilanLongInt SIZEINFO = 4; + MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; + + // TODO check if private queues arrive empty +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef COUNT_LOCAL_VERTEX + MilanLongInt localVertices = 0; +#endif + + // TODO what would be the optimal UCHUNK + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + + while (!U.empty()) + { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); +#endif + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices + +#ifdef COUNT_LOCAL_VERTEX + localVertices++; +#endif + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: +#pragma omp critical(innerProcessMatched) + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); +#endif + + // If the current vertex is pointing to a matched vertex and is not matched + // FIXME is there a way to make candidateMate private? + // for the moment it could generate an error. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and + candidateMate[v - StartIndex] == u) + { + + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + + // TODO is it possible to lock without a critical region? 
+ // TODO there must be a more elegant and efficient way to do this + /* + while(true) { + if (omp_test_lock(&MateLock[v - StartIndex])) { + if (omp_test_lock(&MateLock[w - StartIndex])) break; + else omp_unset_lock(&MateLock[v - StartIndex]); + } + } + */ + + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); +#endif + + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + // Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + + // TODO refactor this + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; + fflush(stdout); +#endif + } + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + // Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + + // omp_unset_lock(&MateLock[v - StartIndex]); + // omp_unset_lock(&MateLock[w - StartIndex]); + + } // End of if(w >=0) + else + { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), + ComputeTag, comm); */ + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + // ghostOwner = inputSubGraph.findOwner(w); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + + } // End of If (candidateMate[v-StartIndex] == u + + } // End of critical region if + + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { 
// Neighbor is a ghost vertex + +#pragma omp critical(innerProcessMatched) + { + + // while(!omp_test_lock(&MateLock[u - StartIndex])); + + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + { // u is local + // Build the Message Packet: + // Message[0] = u; //LOCAL + // Message[1] = v; //GHOST + // Message[2] = SUCCESS; //TYPE + // Send a Request (Asynchronous) + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); +#endif + + QLocalVtx.push_back(u); + QGhostVtx.push_back(v); + QMsgType.push_back(SUCCESS); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + } // End of If( v != Mate[u] ) + + // omp_unset_lock(&MateLock[u - StartIndex]); + + } // End of critical region + } // End of Else //A Ghost Vertex + + } // End of For Loop adj(u) + + } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + + // Ask for the critical section only when a certain amount + // of data have been accumulated in the private queue + if (privateU.size() < UCHUNK && !U.empty()) + continue; + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + } + } // End of while ( /*!Q.empty()*/ !U.empty() ) + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + +// TODO it is possible that this is not working as expected +// further investigation needed. 
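// Illustrative sketch, not part of this patch: if the atomic accumulation below
// turns out to be the problem, an OpenMP reduction is one possible alternative
// (assuming myCard stays the local scalar initialised from *myCardPtr above):
//
//   #pragma omp parallel ... reduction(+ : myCard) ...
//   {
//       ...
//       myCard += privateMyCard; // folded into myCard when the region ends
//   }
//
// which removes the shared update and the need for the explicit atomic.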
+#pragma omp atomic + myCard += privateMyCard; + +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); + +#endif + } + *myCardPtr = myCard; + *msgIndPtr = msgInd; + *NumMessagesBundledPtr = NumMessagesBundled; + *SPtr = S; +} \ No newline at end of file From 3de1e607eb0a4d9b4e4a857125c791e1001fe614 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 03:39:58 -0500 Subject: [PATCH 43/96] sendBundledMessages refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 14 ++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 210 +++------------- .../impl/aggregator/processExposedVertex.cpp | 1 + .../impl/aggregator/sendBundledMessages.cpp | 225 ++++++++++++++++++ 4 files changed, 269 insertions(+), 181 deletions(-) create mode 100644 amgprec/impl/aggregator/sendBundledMessages.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 96630f9c..351dca98 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,6 +66,18 @@ using namespace std; #define NUM_THREAD 4 +// MPI type map +template +MPI_Datatype TypeMap(); +template <> +inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template <> +inline MPI_Datatype TypeMap() { return MPI_INT; } +template <> +inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template <> +inline MPI_Datatype TypeMap() { return MPI_FLOAT; } + #ifdef __cplusplus extern "C" { @@ -150,6 +162,8 @@ extern "C" #define MilanRealMin MINUS_INFINITY #endif + + // Function of find the owner of a ghost vertex using binary search: inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index b6ac6364..cfd6b927 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -8,7 +8,7 @@ #include "parallelComputeCandidateMateB.cpp" #include "processExposedVertex.cpp" #include "processMatchedVertices.cpp" -//#include "extractUChunk.cpp" +#include "sendBundledMessages.cpp" // *********************************************************************** // @@ -85,17 +85,6 @@ #ifdef SERIAL_MPI #else -// MPI type map -template -MPI_Datatype TypeMap(); -template <> -inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template <> -inline MPI_Datatype TypeMap() { return MPI_INT; } -template <> -inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template <> -inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION // WARNING: The vertex block on a given rank is contiguous @@ -177,6 +166,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! 
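// Illustrative usage sketch, not part of this patch, for the TypeMap<> helper
// this patch moves into MatchBoxPC.h: the specialization matching the element
// type selects the MPI datatype at compile time, e.g.
//
//   MilanLongInt u = ...;
//   MPI_Bsend(&u, 1, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
//
// (assuming MilanLongInt maps to MPI_LONG_LONG as in the header hunk above).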
+ // TODO move this inseide the initialization function MilanLongInt *PCounter = new MilanLongInt[numProcs]; for (int i = 0; i < numProcs; i++) PCounter[i] = 0; @@ -220,13 +210,10 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt S; MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - MilanLongInt myIndex = 0; vector PCumulative, PMessageBundle, PSizeInfoMessages; vector SRequest; // Requests that are used for each send message vector SStatus; // Status of sent messages, used in MPI_Wait MilanLongInt MessageIndex = 0; // Pointer for current message - MilanInt OneMessageSize = 0; - MilanLongInt numMessagesToSend; MilanInt BufferSize; MilanLongInt *Buffer; @@ -318,9 +305,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// -//#define debug -#ifndef debug - + vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); processMatchedVertices(NLVer, @@ -336,7 +321,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( verLocPtr, verLocInd, verDistance, - PCounter, + PCounter, Counter, myRank, numProcs, @@ -354,166 +339,32 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( privateQMsgType, privateQOwner); + ///////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////// -#endif - -#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) - { - - -#ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Send Bundles" << endl; - fflush(stdout); -#endif - ///////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////// -#pragma omp barrier // TODO check if necessary -#pragma omp master - { - // Data structures for Bundled Messages: - try - { - PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message - PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure - PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize - PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable - PSizeInfoMessages.resize(numProcs * 3, 0); - - for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! 
- PCumulative[i + 1] = PCumulative[i] + PCounter[i]; - - // OMP not worth parallelizing - // Reuse PCounter to keep track of how many messages were inserted: - for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! - PCounter[i] = 0; - // Build the Message Bundle packet: - - // OMP Not parallelizable - for (MilanInt i = 0; i < NumMessagesBundled; i++) - { // Changed by Fabio to be an integer, addresses needs to be integers! - myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; - PMessageBundle[myIndex + 0] = QLocalVtx[i]; - PMessageBundle[myIndex + 1] = QGhostVtx[i]; - PMessageBundle[myIndex + 2] = QMsgType[i]; - PCounter[QOwner[i]]++; - } - - // Send the Bundled Messages: Use ISend - - try - { - SRequest.reserve(numProcs * 2); // At most two messages per processor - SStatus.reserve(numProcs * 2); // At most two messages per processor - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - MPI_Request myReq; // A sample request - SRequest.resize(numProcs * 2, myReq); - MPI_Status myStat; // A sample status - SStatus.resize(numProcs * 2, myStat); - - // Send the Messages - for (MilanInt i = 0; i < numProcs; i++) - { // Changed by Fabio to be an integer, addresses needs to be integers! - if (i == myRank) // Do not send anything to yourself - continue; - // Send the Message with information about the size of next message: - // Build the Message Packet: - PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message - PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet - PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; - fflush(stdout); -#endif - if (PSizeInfoMessages[i * 3 + 0] > 0) - { // Send only if it is a nonempty packet - MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[MessageIndex]); - msgActual++; - MessageIndex++; - // Now Send the message with the data packet: -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending Bundle to : " << i << endl; - for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) - cout << PMessageBundle[k] << ","; - cout << endl; - fflush(stdout); -#endif - MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); - MessageIndex++; - } // End of if size > 0 - } - // Free up temporary memory: - PCumulative.clear(); - QLocalVtx.clear(); - QGhostVtx.clear(); - QMsgType.clear(); - QOwner.clear(); - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ")Total number of potential message X 2 = " << numGhostEdges * 2; - cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; - if (numGhostEdges > 0) - { - cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(numGhostEdges * 2)) * 100.0 << "% \n"; - } - fflush(stdout); -#endif - - // Allocate memory for MPI Send messages: - /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! 
*/ - OneMessageSize = 0; - MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet - // How many messages to send? - // Potentially three kinds of messages will be sent/received: - // Request, Success, Failure. - // But only two will be sent from a given processor. - // Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; - BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; - - Buffer = 0; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; - cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; - cout << "\n(" << myRank << ")BufferSize = " << BufferSize; - cout << "\n(" << myRank << ")Attaching Buffer on.. "; - fflush(stdout); -#endif - if (BufferSize > 0) - { - Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory - if (Buffer == 0) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; - exit(1); - } - MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer - } - } // End of master + sendBundledMessages(&numGhostEdges, + &BufferSize, + Buffer, + PCumulative, + PMessageBundle, + PSizeInfoMessages, + PCounter, + NumMessagesBundled, + &msgActual, + &MessageIndex, + numProcs, + myRank, + ComputeTag, + BundleTag, + comm, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + SRequest, + SStatus); - } // end of parallel region ///////////////////////// END OF SEND BUNDLED MESSAGES ////////////////////////////////// finishTime = MPI_Wtime(); @@ -773,10 +624,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - /* - RECEIVE message ( u, v, message_type ); - // u is a GHOST vertex ... v is a LOCAL vertex - */ + #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 3847110a..dd9562d5 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -215,6 +215,7 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQMsgType, privateQOwner); +//TODO move this outside of the parallel region!! 
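// Illustrative sketch, not part of this patch: what this TODO suggests is to
// drop the "omp master" block below and perform the write-back once, after the
// enclosing "#pragma omp parallel" region closes, e.g.
//
//   } // end of parallel region
//   *myCardPtr = myCard;   // and likewise for the other output pointers
//
// (assuming myCard is shared across the region, so its final value is visible
// once the region has ended).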
#pragma omp master { *myCardPtr = myCard; diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp new file mode 100644 index 00000000..e16c5669 --- /dev/null +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -0,0 +1,225 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus) +{ + + MilanLongInt myIndex = 0, msgActual = *msgActualPtr, MessageIndex = *MessageIndexPtr, numGhostEdges = *numGhostEdgesPtr, numMessagesToSend; + const MilanLongInt SIZEINFO = 4; + MilanInt i = 0, OneMessageSize = 0, BufferSize = *BufferSizePtr; + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Send Bundles" << endl; + fflush(stdout); +#endif + +#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD) + { +#pragma omp master + { +// Data structures for Bundled Messages: +#pragma omp task depend(inout \ + : PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \ + : NumMessagesBundled, numProcs) + {try { + PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message + PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets +} +catch (length_error) +{ + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); +} +PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize +PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable +PSizeInfoMessages.resize(numProcs * 3, 0); +} + +#pragma omp task depend(inout \ + : PCumulative) depend(in \ + : PCounter) +{ + for (i = 0; i < numProcs; i++) + PCumulative[i + 1] = PCumulative[i] + PCounter[i]; +} + +#pragma omp task depend(inout \ + : PCounter) +{ + // Reuse PCounter to keep track of how many messages were inserted: + for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! 
+ PCounter[i] = 0; +} + +// Build the Message Bundle packet: +#pragma omp task depend(in \ + : PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \ + : myIndex, PMessageBundle, PCounter) +{ + for (i = 0; i < NumMessagesBundled; i++) + { + myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; + PMessageBundle[myIndex + 0] = QLocalVtx[i]; + PMessageBundle[myIndex + 1] = QGhostVtx[i]; + PMessageBundle[myIndex + 2] = QMsgType[i]; + PCounter[QOwner[i]]++; + } +} + +// Send the Bundled Messages: Use ISend +#pragma omp task depend(out \ + : SRequest, SStatus) +{ + try + { + SRequest.reserve(numProcs * 2); // At most two messages per processor + SStatus.reserve(numProcs * 2); // At most two messages per processor + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } +} + +// Send the Messages +#pragma omp task depend(inout \ + : SRequest, PSizeInfoMessages, PCumulative) depend(out \ + : msgActual, MessageIndex) +{ + for (i = 0; i < numProcs; i++) + { // Changed by Fabio to be an integer, addresses needs to be integers! + if (i == myRank) // Do not send anything to yourself + continue; + // Send the Message with information about the size of next message: + // Build the Message Packet: + PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message + PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet + PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; + fflush(stdout); +#endif + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[MessageIndex]); + msgActual++; + MessageIndex++; + // Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; + for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) + cout << PMessageBundle[k] << ","; + cout << endl; + fflush(stdout); +#endif + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); + MessageIndex++; + } // End of if size > 0 + } +} + +#pragma omp task depend(inout \ + : PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner) +{ + + // Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); +} + +#pragma omp task depend(inout : OneMessageSize, BufferSize) depend(out : numMessagesToSend) depend(in : numGhostEdges) +{ + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; + if (numGhostEdges > 0) + { + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(numGhostEdges * 2)) * 100.0 << "% \n"; + } + fflush(stdout); +#endif + + // Allocate memory for MPI Send messages: + /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! 
*/ + OneMessageSize = 0; + MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; + BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; +} + +#pragma omp task depend(out : Buffer) depend(in : BufferSize) +{ + Buffer = 0; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << BufferSize; + cout << "\n(" << myRank << ")Attaching Buffer on.. "; + fflush(stdout); +#endif + if (BufferSize > 0) + { + Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; + exit(1); + } + MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer + } +} +} +} + +*MessageIndexPtr = MessageIndex; +*msgActualPtr = msgActual; +*numGhostEdgesPtr = numGhostEdges; +*BufferSizePtr = BufferSize; +} \ No newline at end of file From df1e4a4616f1b7d9a2aaa8a4b790d7c7d9dca302 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 04:31:51 -0500 Subject: [PATCH 44/96] PROCESS_CROSS_EDGE refactoring --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 81 ++----------------- amgprec/impl/aggregator/processCrossEdge.cpp | 33 ++++++++ .../impl/aggregator/processExposedVertex.cpp | 20 +---- .../aggregator/processMatchedVertices.cpp | 16 +--- 4 files changed, 45 insertions(+), 105 deletions(-) create mode 100644 amgprec/impl/aggregator/processCrossEdge.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index cfd6b927..6b189da5 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -9,6 +9,7 @@ #include "processExposedVertex.cpp" #include "processMatchedVertices.cpp" #include "sendBundledMessages.cpp" +//#include "processCrossEdge.cpp" // *********************************************************************** // @@ -506,21 +507,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif - // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + } // 
End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -759,20 +748,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); #endif - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) - if (Counter[Ghost2LocalMap[u]] > 0) - { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement - if (Counter[Ghost2LocalMap[u]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages" << endl; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -784,22 +760,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; fflush(stdout); #endif - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - // process it again - if (Counter[Ghost2LocalMap[u]] > 0) - { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement - if (Counter[Ghost2LocalMap[u]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -877,20 +839,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -949,22 +898,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")Message type is FAILURE" << endl; fflush(stdout); #endif - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - // process it again - if (Counter[Ghost2LocalMap[u]] > 0) - { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement - if (Counter[Ghost2LocalMap[u]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); } // End of else: CASE III } // End of else: CASE I } // End of if (!MsgQ.empty()) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp new file mode 100644 index 
00000000..f99bee1a --- /dev/null +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -0,0 +1,33 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void PROCESS_CROSS_EDGE(vector &Counter, + map &Ghost2LocalMap, + MilanLongInt edge, + MilanLongInt *SPtr) +{ + MilanLongInt S = *SPtr; + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B + if (Counter[Ghost2LocalMap[edge]] > 0) + { + Counter[Ghost2LocalMap[edge]] -= 1; // Decrement + if (Counter[Ghost2LocalMap[edge]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; + fflush(stdout); +#endif + } + } // End of if Counter[edge] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B + *SPtr = S; +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index dd9562d5..9ba155f9 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -8,6 +8,7 @@ #include "dataStrStaticQueue.h" #include "omp.h" #include "queueTransfer.cpp" +#include "processCrossEdge.cpp" inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt *candidateMate, @@ -129,24 +130,7 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, //TODO refactor this!! // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) -#pragma omp critical - { - if (Counter[Ghost2LocalMap[w]] > 0) - { - - Counter[Ghost2LocalMap[w]] -= 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 5f54ef79..1e496888 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -177,20 +177,8 @@ inline void processMatchedVertices( // TODO refactor this // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else From d19443052d9734b12f21fe8b15848354a1d25683 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 05:24:31 -0500 Subject: [PATCH 45/96] Insert private queue error in processMatchedVertices.cpp --- .../aggregator/processMatchedVertices.cpp | 62 +++++++++++++++---- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 
1e496888..567ae2e3 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -8,7 +8,7 @@ #include "omp.h" #include "extractUChunk.cpp" -#define UCHUNK 1000 +//#define privateQueues inline void processMatchedVertices( MilanLongInt NLVer, @@ -152,13 +152,22 @@ inline void processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + +#ifdef privateQueues + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); +#endif +#ifndef privateQueues + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); QOwner.push_back(ghostOwner); +#endif PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; @@ -217,16 +226,25 @@ inline void processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), - ComputeTag, comm); */ - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); + // ghostOwner = inputSubGraph.findOwner(w); ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + +#ifdef privateQueues + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); +#endif +#ifndef privateQueues + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); QOwner.push_back(ghostOwner); +#endif + PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; @@ -264,13 +282,23 @@ inline void processMatchedVertices( fflush(stdout); #endif - QLocalVtx.push_back(u); - QGhostVtx.push_back(v); - QMsgType.push_back(SUCCESS); ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + +#ifdef privateQueues + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); +#endif +#ifndef privateQueues + QLocalVtx.push_back(u); + QGhostVtx.push_back(v); + QMsgType.push_back(SUCCESS); QOwner.push_back(ghostOwner); +#endif + PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; @@ -289,12 +317,22 @@ inline void processMatchedVertices( // of data have been accumulated in the private queue if (privateU.size() < UCHUNK && !U.empty()) continue; + +#ifdef privateQueues +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } +#endif +#ifndef privateQueues queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner); +#endif } } // End of while ( /*!Q.empty()*/ !U.empty() ) From 64c23f93f8358adebe790913e9c31ce0b3dcc0d9 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 10:01:50 -0500 Subject: [PATCH 46/96] processMessags partial refactoring, message const refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 4 + ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 144 ++---------------- .../impl/aggregator/computeCandidateMate.cpp | 41 +++-- 
.../parallelComputeCandidateMateB.cpp | 2 +- .../impl/aggregator/processExposedVertex.cpp | 7 +- .../aggregator/processMatchedVertices.cpp | 5 - amgprec/impl/aggregator/processMessages.cpp | 130 ++++++++++++++++ 7 files changed, 177 insertions(+), 156 deletions(-) create mode 100644 amgprec/impl/aggregator/processMessages.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 351dca98..c6445c81 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -65,6 +65,10 @@ using namespace std; #define NUM_THREAD 4 +const MilanLongInt REQUEST = 1; +const MilanLongInt SUCCESS = 2; +const MilanLongInt FAILURE = 3; +const MilanLongInt SIZEINFO = 4; // MPI type map template diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 6b189da5..5a5ef836 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -9,7 +9,8 @@ #include "processExposedVertex.cpp" #include "processMatchedVertices.cpp" #include "sendBundledMessages.cpp" -//#include "processCrossEdge.cpp" +#include "processMessages.cpp" + // *********************************************************************** // @@ -155,10 +156,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( // Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] Message.resize(3, -1); - const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; - const MilanLongInt SIZEINFO = 4; MilanLongInt message_type = 0; // Data structures for Message Bundling: // Although up to two messages can be sent along any cross edge, @@ -186,7 +183,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt k = -1, adj1 = -1, adj2 = -1; MilanLongInt k1 = -1, adj11 = -1, adj12 = -1; MilanLongInt myCard = 0; - MilanInt Sender = 0; // This is the rank of the sending nodes, it has to be an integer! Fabio // Build the Ghost Vertex Set: Vg map Ghost2LocalMap; // Map each ghost vertex to a local vertex @@ -614,90 +610,19 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")About to begin Message processing phase ... S=" << S << endl; - fflush(stdout); -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); -#endif - // BLOCKING RECEIVE: -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << " Waiting for blocking receive..." 
<< endl; - fflush(stdout); - fflush(stdout); -#endif - error_codeC = MPI_Recv(&Message[0], 3, TypeMap(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS) - { - MPI_Error_string(error_codeC, error_message, &message_length); - cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n"; - fflush(stdout); - } - Sender = computeStatus.MPI_SOURCE; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl; - fflush(stdout); -#endif - // If the Message Type is a size indicator, then receive the bigger message. - if (Message[2] == SIZEINFO) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; - fflush(stdout); -#endif - bundleSize = Message[0]; //#of integers in the message - // Build the Message Buffer: - if (!ReceiveBuffer.empty()) - ReceiveBuffer.clear(); // Empty it out first - ReceiveBuffer.resize(bundleSize, -1); // Initialize -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message Bundle Before: " << endl; - for (i = 0; i < bundleSize; i++) - cout << ReceiveBuffer[i] << ","; - cout << endl; - fflush(stdout); -#endif - // Receive the message - error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap(), Sender, BundleTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS) - { - MPI_Error_string(error_codeC, error_message, &message_length); - cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; - fflush(stdout); - } -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message Bundle After: " << endl; - for (i = 0; i < bundleSize; i++) - cout << ReceiveBuffer[i] << ","; - cout << endl; - fflush(stdout); -#endif - } - else - { // Just a single message: -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; - fflush(stdout); -#endif - // Add the current message to Queue: - bundleSize = 3; //#of integers in the message - // Build the Message Buffer: - if (!ReceiveBuffer.empty()) - ReceiveBuffer.clear(); // Empty it out first - ReceiveBuffer.resize(bundleSize, -1); // Initialize + processMessages(error_codeC, + numProcs, + myRank, + ComputeTag, + BundleTag, + comm, + Message, + error_message, + message_length, + ReceiveBuffer, + &bundleSize); + - ReceiveBuffer[0] = Message[0]; // u - ReceiveBuffer[1] = Message[1]; // v - ReceiveBuffer[2] = Message[2]; // message_type - } bundleCounter = 0; while (bundleCounter < bundleSize) { @@ -707,17 +632,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( bundleCounter++; message_type = ReceiveBuffer[bundleCounter]; // TYPE bundleCounter++; -#ifdef DEBUG_GHOST_ - if ((v < StartIndex) || (v > EndIndex)) - { - cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; - fflush(stdout); - } -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl; - fflush(stdout); -#endif + // CASE I: REQUEST if (message_type == REQUEST) { @@ -774,33 +689,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( if (candidateMate[v - StartIndex] == u) { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: 
PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - w = -1; - heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN - for (k1 = adj11; k1 < adj12; k1++) - { - if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) - { // Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched - continue; - } - else - { // A local vertex - if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched - continue; - } - - if ((edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) - { - heaviestEdgeWt = edgeLocWeight[k1]; - w = verLocInd[k1]; - } - } // End of for loop + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); candidateMate[v - StartIndex] = w; - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; fflush(stdout); @@ -830,7 +720,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( { Mate[v - StartIndex] = w; // v is local GMate[Ghost2LocalMap[w]] = v; // w is ghost - // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; @@ -878,7 +767,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - // MPI_Bsend(&Message[0], 3, MilanMpiLongInt, findOwnerOfGhost(w, verDistance, myRank, numProcs), ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp index 92e3c92b..b6d8b3ff 100644 --- a/amgprec/impl/aggregator/computeCandidateMate.cpp +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -11,21 +11,23 @@ */ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, - MilanLongInt* verLocInd, - MilanReal* edgeLocWeight) + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight) { MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN int finalK; - for (int k = adj1; k < adj2; k++) { + for (int k = adj1; k < adj2; k++) + { if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) + { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; finalK = k; } - } //End of for loop + } // End of for loop return finalK; } @@ -45,25 +47,32 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, */ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, - MilanReal* edgeLocWeight, + MilanReal *edgeLocWeight, MilanLongInt k, - MilanLongInt* verLocInd, + MilanLongInt *verLocInd, MilanLongInt StartIndex, MilanLongInt EndIndex, - vector & GMate, - MilanLongInt* Mate, - map & Ghost2LocalMap) + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) { + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - for (k = adj1; k < adj2; k++) { - if 
(isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; + MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) + { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) + { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; } - } //End of for loop + } // End of for loop + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + return w; } \ No newline at end of file diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp index ced93456..d3e39056 100644 --- a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -18,7 +18,7 @@ inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, MilanLongInt v = -1; -#pragma omp parallel private(v) default(shared) num_threads(4) +#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD) { #pragma omp for schedule(static) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 9ba155f9..ff57b5e5 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -41,17 +41,12 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, staticQueue &privateQOwner) { - //TODO define all the constants in a single place! - const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; - const MilanLongInt SIZEINFO = 4; MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0, S = *SPtr; MilanLongInt myCard = 0, msgInd = 0; MilanLongInt NumMessagesBundled = 0; MilanInt ghostOwner = 0; -#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) +#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { #pragma omp for reduction(+ \ : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 567ae2e3..c6c729ad 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -43,11 +43,6 @@ inline void processMatchedVertices( staticQueue &privateQOwner) { - // TODO define all the constants in a single place! 
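// --- Editor's note (annotation, not part of the patch): the constants removed just
// below are the same REQUEST / SUCCESS / FAILURE / SIZEINFO values added to
// MatchBoxPC.h earlier in this patch, so every translation unit now shares a single
// definition. For reference, a minimal sketch of the 3-integer message they tag,
// using names that appear elsewhere in this series (Message, v, w, ghostOwner,
// ComputeTag, comm) and assuming TypeMap<MilanLongInt>() is the MPI type-map helper:
//
//   Message[0] = v;        // local endpoint of the cross edge
//   Message[1] = w;        // ghost endpoint, owned by rank ghostOwner
//   Message[2] = REQUEST;  // or SUCCESS / FAILURE / SIZEINFO
//   MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);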
- const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; - const MilanLongInt SIZEINFO = 4; MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp new file mode 100644 index 00000000..ae2c8671 --- /dev/null +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -0,0 +1,130 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void processMessages(int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt *BundleSizePtr) +{ + + MilanInt Sender; + MPI_Status computeStatus; + MilanLongInt bundleSize = *BundleSizePtr; + +#ifdef PRINT_DEBUG_INFO_ + cout + << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")About to begin Message processing phase ... S=" << S << endl; + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + // BLOCKING RECEIVE: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << " Waiting for blocking receive..." << endl; + fflush(stdout); + fflush(stdout); +#endif + + error_codeC = MPI_Recv(&Message[0], 3, TypeMap(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) + { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n"; + fflush(stdout); + } + Sender = computeStatus.MPI_SOURCE; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl; + fflush(stdout); +#endif + + if (Message[2] == SIZEINFO) + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; + fflush(stdout); +#endif + bundleSize = Message[0]; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message Bundle Before: " << endl; + for (i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); +#endif + // Receive the message + error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) + { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; + fflush(stdout); + } +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message Bundle After: " << endl; + for (i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); +#endif + } + else + { // Just a single message: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << 
myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; + fflush(stdout); +#endif + // Add the current message to Queue: + bundleSize = 3; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize + + ReceiveBuffer[0] = Message[0]; // u + ReceiveBuffer[1] = Message[1]; // v + ReceiveBuffer[2] = Message[2]; // message_type + } + +#ifdef DEBUG_GHOST_ + if ((v < StartIndex) || (v > EndIndex)) + { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl; + fflush(stdout); +#endif + + *BundleSizePtr = bundleSize; + return; +} \ No newline at end of file From 32777cc15c562f8288e084d3bcf819f5c9cc4d08 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 11:09:10 -0500 Subject: [PATCH 47/96] clean partial refactoring --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 75 ++++------------ amgprec/impl/aggregator/clean.cpp | 88 +++++++++++++++++++ 2 files changed, 103 insertions(+), 60 deletions(-) create mode 100644 amgprec/impl/aggregator/clean.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 5a5ef836..ad5d1142 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -10,7 +10,7 @@ #include "processMatchedVertices.cpp" #include "sendBundledMessages.cpp" #include "processMessages.cpp" - +#include "clean.cpp" // *********************************************************************** // @@ -610,7 +610,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - processMessages(error_codeC, + processMessages(error_codeC, numProcs, myRank, ComputeTag, @@ -622,7 +622,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ReceiveBuffer, &bundleSize); - bundleCounter = 0; while (bundleCounter < bundleSize) { @@ -802,66 +801,22 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif } // End of while (true) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Waitall= " << endl; - fflush(stdout); -#endif -#ifdef DEBUG_HANG_ - cout << "\n(" << myRank << ") Waitall " << endl; - fflush(stdout); -#endif - // MPI_Barrier(comm); - // Cleanup Phase - MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); - // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer - if (BufferSize > 0) - { - MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer - free(Buffer); // Free the memory that was allocated - } + clean(myRank, + MessageIndex, + SRequest, + SStatus, + BufferSize, + Buffer, + msgActual, + msgActualSent, + msgInd, + msgIndSent, + NumMessagesBundled, + msgPercent); + finishTime = MPI_Wtime(); *ph2_time = finishTime - startTime; // Time taken for Phase-2 *ph2_card = myCard; // 
Cardinality at the end of Phase-2 - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")End of function to compute matching: " << endl; - fflush(stdout); - cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; - fflush(stdout); - cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; - fflush(stdout); - cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; - fflush(stdout); -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; - cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; - cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; - if (msgInd > 0) - { - cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; - } - fflush(stdout); -#endif - - *msgActualSent = msgActual; - *msgIndSent = msgInd; - if (msgInd > 0) - { - *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; - } - else - { - *msgPercent = 0; - } - -#ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Done" << endl; - fflush(stdout); -#endif - // MPI_Barrier(comm); } // End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate #endif diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp new file mode 100644 index 00000000..6c5543b8 --- /dev/null +++ b/amgprec/impl/aggregator/clean.cpp @@ -0,0 +1,88 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + + +//TODO comment +//TODO use task +//TODO destroy the locks + +inline void clean(MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent) +{ + // Cleanup Phase + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Waitall= " << endl; + fflush(stdout); +#endif +#ifdef DEBUG_HANG_ + cout << "\n(" << myRank << ") Waitall " << endl; + fflush(stdout); +#endif + return; + + MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); + + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated + } + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")End of function to compute matching: " << endl; + fflush(stdout); + cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; + fflush(stdout); + cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; + fflush(stdout); + cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; + cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; + if (msgInd > 0) + { + cout << "\n(" << myRank << ") Percentage of messages bundled 
= " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; + } + fflush(stdout); +#endif + + *msgActualSent = msgActual; + *msgIndSent = msgInd; + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { + *msgPercent = 0; + } + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Done" << endl; + fflush(stdout); +#endif +} \ No newline at end of file From 36bd3a51a22a26589728ab2cf06c32f8f3336e3d Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 11 Jul 2022 16:31:58 -0500 Subject: [PATCH 48/96] Makefile fix --- amgprec/impl/aggregator/Makefile | 13 + amgprec/impl/aggregator/MatchBoxPC.h | 314 +++++++++++------- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 6 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 12 - amgprec/impl/aggregator/clean.cpp | 11 +- .../impl/aggregator/computeCandidateMate.cpp | 4 +- amgprec/impl/aggregator/extractUChunk.cpp | 11 +- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 8 +- amgprec/impl/aggregator/initialize.cpp | 14 +- amgprec/impl/aggregator/isAlreadyMatched.cpp | 8 +- .../parallelComputeCandidateMateB.cpp | 10 +- amgprec/impl/aggregator/processCrossEdge.cpp | 10 +- .../impl/aggregator/processExposedVertex.cpp | 14 +- .../aggregator/processMatchedVertices.cpp | 11 +- amgprec/impl/aggregator/processMessages.cpp | 9 +- amgprec/impl/aggregator/queueTransfer.cpp | 9 +- .../impl/aggregator/sendBundledMessages.cpp | 11 +- 17 files changed, 233 insertions(+), 242 deletions(-) diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 1f6f52af..f1760822 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -62,9 +62,22 @@ amg_s_parmatch_smth_bld.o \ amg_s_parmatch_spmm_bld_inner.o MPCOBJS=MatchBoxPC.o \ +sendBundledMessages.o \ +initialize.o \ +extractUChunk.o \ +isAlreadyMatched.o \ +findOwnerOfGhost.o \ +computeCandidateMate.o \ +parallelComputeCandidateMateB.o \ +processMatchedVertices.o \ +processCrossEdge.o \ +queueTransfer.o \ +processMessages.o \ +processExposedVertex.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o + OBJS = $(FOBJS) $(MPCOBJS) LIBNAME=libamg_prec.a diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index c6445c81..dfcb6f7e 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -64,7 +64,10 @@ #include "dataStrStaticQueue.h" using namespace std; + #define NUM_THREAD 4 +#define UCHUNK 1000 + const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; const MilanLongInt FAILURE = 3; @@ -166,119 +169,206 @@ extern "C" #define MilanRealMin MINUS_INFINITY #endif - - // Function of find the owner of a ghost vertex using binary search: - inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs); - - inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt *verLocInd, - MilanReal *edgeLocWeight); - - inline bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap); - - inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal *edgeLocWeight, - MilanLongInt k, - MilanLongInt *verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap); - - 
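// --- Editor's note (annotation, not part of the patch): the inline declarations
// removed in this hunk reappear further down without the qualifier, because each
// helper is now built into its own object file (see the MPCOBJS list added in the
// Makefile hunk above) instead of being #include'd as a .cpp into the main
// translation unit. The pattern, sketched with a hypothetical helper foo rather than
// any real function of this library:
//
//   // MatchBoxPC.h  -- declaration only, visible to every .cpp
//   void foo(MilanLongInt x);
//
//   // foo.cpp       -- single definition, compiled once into foo.o and linked in
//   #include "MatchBoxPC.h"
//   void foo(MilanLongInt x) { /* ... */ }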
inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *numGhostEdgesPtr, - MilanLongInt *numGhostVerticesPtr, - MilanLongInt *S, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - omp_lock_t *MateLock, - map &Ghost2LocalMap, - vector &Counter, - vector &verGhostPtr, - vector &verGhostInd, - vector &tempCounter, - vector &GMate, - vector &Message, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, - MilanLongInt *verLocPtr, - MilanLongInt *verLocInd, - MilanInt myRank, - MilanReal *edgeLocWeight, - MilanLongInt *candidateMate); - - inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, - MilanLongInt *candidateMate, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - MilanLongInt *Mate, - vector &GMate, - map &Ghost2LocalMap, - MilanReal *edgeLocWeight, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, - MilanLongInt *SPtr, - MilanLongInt *verDistance, - MilanLongInt *PCounter, - vector &Counter, - MilanInt myRank, - MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - inline void queuesTransfer(staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, - MilanLongInt *verDistance, - MilanLongInt *Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, - MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, - MilanLongInt *ph1_card, MilanLongInt *ph2_card); + MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + MilanInt myRank, MilanInt numProcs); + + MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight); + + void queuesTransfer(staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); + + MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); + + void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt 
*numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + omp_lock_t *MateLock, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void clean(MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent); + + void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); + + void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void PROCESS_CROSS_EDGE(vector &Counter, + map &Ghost2LocalMap, + MilanLongInt edge, + MilanLongInt *SPtr); + + void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); + + void processMessages(int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt *BundleSizePtr); 
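// --- Editor's note (annotation, not part of the patch): processMessages(), declared
// just above, wraps a two-step blocking receive. A 3-integer header [u, v, type]
// always arrives first on ComputeTag; when type == SIZEINFO, u carries the length of
// a bundled message that follows from the same sender on BundleTag. Roughly, with
// the parameter names above and error handling omitted:
//
//   MPI_Status st;
//   MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &st);
//   MilanInt sender = st.MPI_SOURCE;
//   MilanLongInt bundleSize = 3;
//   if (Message[2] == SIZEINFO) {            // header announces a bundled message
//     bundleSize = Message[0];
//     ReceiveBuffer.clear();
//     ReceiveBuffer.resize(bundleSize, -1);
//     MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(),
//              sender, BundleTag, comm, &st);
//   } else {                                 // a single [u, v, type] message
//     ReceiveBuffer.assign(Message.begin(), Message.begin() + 3);
//   }
//   *BundleSizePtr = bundleSize;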
+ + void extractUChunk( + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU); + + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 8be438b6..f03f726f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -72,12 +72,6 @@ #ifdef SERIAL_MPI #else -//MPI type map -template MPI_Datatype TypeMap(); -template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template<> inline MPI_Datatype TypeMap() { return MPI_INT; } -template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION //WARNING: The vertex block on a given rank is contiguous diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ad5d1142..50930601 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,16 +1,4 @@ #include "MatchBoxPC.h" -#include -#include -#include "isAlreadyMatched.cpp" -#include "findOwnerOfGhost.cpp" -#include "computeCandidateMate.cpp" -#include "initialize.cpp" -#include "parallelComputeCandidateMateB.cpp" -#include "processExposedVertex.cpp" -#include "processMatchedVertices.cpp" -#include "sendBundledMessages.cpp" -#include "processMessages.cpp" -#include "clean.cpp" // *********************************************************************** // diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 6c5543b8..5204894a 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -1,19 +1,10 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" - //TODO comment //TODO use task //TODO destroy the locks -inline void clean(MilanInt myRank, +void clean(MilanInt myRank, MilanLongInt MessageIndex, vector &SRequest, vector &SStatus, diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp index b6d8b3ff..36eaa727 100644 --- a/amgprec/impl/aggregator/computeCandidateMate.cpp +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -9,7 +9,7 @@ * @param edgeLocWeight * @return */ -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, +MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, MilanLongInt *verLocInd, MilanReal *edgeLocWeight) @@ -45,7 +45,7 @@ inline MilanLongInt 
firstComputeCandidateMate(MilanLongInt adj1, * @param Ghost2LocalMap * @return */ -inline MilanLongInt computeCandidateMate(MilanLongInt adj1, +MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, MilanReal *edgeLocWeight, MilanLongInt k, diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp index e8a6951c..b5bc1f5f 100644 --- a/amgprec/impl/aggregator/extractUChunk.cpp +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -1,15 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -#define UCHUNK 1000 - -inline void extractUChunk( +void extractUChunk( vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU) diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 10850607..59a87bc3 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -1,13 +1,7 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" ///Find the owner of a ghost node: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, +MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs) { //MilanLongInt Size = mVerDistance.size(); MilanLongInt mStartInd = mVerDistance[myRank]; diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index c5ae3f26..979cdcf5 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -1,16 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" - -#define NUM_THREAD 4 - -inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + +void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *numGhostEdgesPtr, MilanLongInt *numGhostVerticesPtr, diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index 5a9cf476..dbb1052f 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -1,10 +1,4 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" /** * //TODO documentation @@ -17,7 +11,7 @@ * @param Ghost2LocalMap * @return */ -inline bool isAlreadyMatched(MilanLongInt node, +bool isAlreadyMatched(MilanLongInt node, MilanLongInt StartIndex, MilanLongInt EndIndex, vector &GMate, diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp index d3e39056..998edd9e 100644 --- a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -1,14 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, +void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanInt myRank, diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp 
b/amgprec/impl/aggregator/processCrossEdge.cpp index f99bee1a..05cae5d2 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,14 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void PROCESS_CROSS_EDGE(vector &Counter, +void PROCESS_CROSS_EDGE(vector &Counter, map &Ghost2LocalMap, MilanLongInt edge, MilanLongInt *SPtr) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index ff57b5e5..50a5ecfd 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -1,16 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -#include "queueTransfer.cpp" -#include "processCrossEdge.cpp" - -inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + +void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt *candidateMate, MilanLongInt *verLocInd, MilanLongInt *verLocPtr, diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index c6c729ad..aaef21a1 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -1,16 +1,8 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -#include "extractUChunk.cpp" //#define privateQueues -inline void processMatchedVertices( +void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, staticQueue &U, @@ -61,6 +53,7 @@ inline void processMatchedVertices( #endif // TODO what would be the optimal UCHUNK + // TODO refactor vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index ae2c8671..cb43cdb8 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,13 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void processMessages(int error_codeC, +void processMessages(int error_codeC, MilanInt numProcs, MilanInt myRank, int ComputeTag, diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index becf14cf..ed2829c6 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -1,13 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void queuesTransfer(staticQueue &U, +void queuesTransfer(staticQueue &U, staticQueue &privateU, vector &QLocalVtx, vector &QGhostVtx, diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index e16c5669..382d8a16 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -1,13 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include 
"omp.h" - -inline void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + +void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanInt *BufferSizePtr, MilanLongInt *Buffer, vector &PCumulative, From c7e81935142a3e8d352c5c35d90440efed86e48e Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 12 Jul 2022 12:12:15 -0500 Subject: [PATCH 49/96] omp task in clean.cpp, lock destroy --- amgprec/impl/aggregator/MatchBoxPC.h | 6 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 6 +- amgprec/impl/aggregator/clean.cpp | 133 ++++++++++-------- 3 files changed, 84 insertions(+), 61 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index dfcb6f7e..0b3dcd74 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -234,7 +234,8 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); - void clean(MilanInt myRank, + void clean(MilanLongInt NLVer, + MilanInt myRank, MilanLongInt MessageIndex, vector &SRequest, vector &SStatus, @@ -245,7 +246,8 @@ extern "C" MilanLongInt msgInd, MilanLongInt *msgIndSent, MilanLongInt NumMessagesBundled, - MilanReal *msgPercent); + MilanReal *msgPercent, + omp_lock_t *MateLock); void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, MilanLongInt *verLocPtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 50930601..e45ee792 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -789,7 +789,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif } // End of while (true) - clean(myRank, + clean(NLVer, + myRank, MessageIndex, SRequest, SStatus, @@ -800,7 +801,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( msgInd, msgIndSent, NumMessagesBundled, - msgPercent); + msgPercent, + MateLock); finishTime = MPI_Wtime(); *ph2_time = finishTime - startTime; // Time taken for Phase-2 diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 5204894a..d91076c9 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -1,79 +1,98 @@ #include "MatchBoxPC.h" -//TODO comment -//TODO use task -//TODO destroy the locks +// TODO comment +// TODO use task +// TODO destroy the locks -void clean(MilanInt myRank, - MilanLongInt MessageIndex, - vector &SRequest, - vector &SStatus, - MilanInt BufferSize, - MilanLongInt *Buffer, - MilanLongInt msgActual, - MilanLongInt *msgActualSent, - MilanLongInt msgInd, - MilanLongInt *msgIndSent, - MilanLongInt NumMessagesBundled, - MilanReal *msgPercent) +void clean(MilanLongInt NLVer, + MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent, + omp_lock_t *MateLock) { // Cleanup Phase +#pragma omp parallel + { +#pragma omp master + { +#pragma omp task + { + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Waitall= " << endl; - fflush(stdout); + cout << "\n(" << myRank << ") Waitall= " << endl; + fflush(stdout); #endif #ifdef DEBUG_HANG_ - cout << "\n(" << myRank << ") Waitall " << endl; - fflush(stdout); + 
cout << "\n(" << myRank << ") Waitall " << endl; + fflush(stdout); #endif - return; + return; - MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); + MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); - // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer - if (BufferSize > 0) - { - MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer - free(Buffer); // Free the memory that was allocated - } + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated + } + } #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")End of function to compute matching: " << endl; - fflush(stdout); - cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; - fflush(stdout); - cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; - fflush(stdout); - cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")End of function to compute matching: " << endl; + fflush(stdout); + cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; + fflush(stdout); + cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; + fflush(stdout); + cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; + fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; - cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; - cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; - if (msgInd > 0) - { - cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; - } - fflush(stdout); + cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; + cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; + if (msgInd > 0) + { + cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; + } + fflush(stdout); #endif - *msgActualSent = msgActual; - *msgIndSent = msgInd; - if (msgInd > 0) - { - *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; - } - else - { - *msgPercent = 0; - } +#pragma omp task + { + *msgActualSent = msgActual; + *msgIndSent = msgInd; + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { + *msgPercent = 0; + } + } + // Destroy the locks +#pragma omp taskloop num_tasks(NUM_THREAD) + for (int i = 0; i < NLVer; i++) + omp_destroy_lock(&MateLock[i]); #ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Done" << endl; - fflush(stdout); + if (myRank == 0) + cout << "\n(" << myRank << ") Done" << endl; + fflush(stdout); #endif + } + } } \ No newline at end of file From ccac816f52b4be91a0231c14c3a4983c85285e04 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 12 Jul 2022 13:24:12 -0500 Subject: [PATCH 50/96] processCrossEdge small refactoring --- amgprec/impl/aggregator/processCrossEdge.cpp | 11 +++++------ 1 file 
changed, 5 insertions(+), 6 deletions(-) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 05cae5d2..1ef99560 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,11 +1,10 @@ #include "MatchBoxPC.h" void PROCESS_CROSS_EDGE(vector &Counter, - map &Ghost2LocalMap, - MilanLongInt edge, - MilanLongInt *SPtr) + map &Ghost2LocalMap, + MilanLongInt edge, + MilanLongInt *SPtr) { - MilanLongInt S = *SPtr; // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B if (Counter[Ghost2LocalMap[edge]] > 0) @@ -13,13 +12,13 @@ void PROCESS_CROSS_EDGE(vector &Counter, Counter[Ghost2LocalMap[edge]] -= 1; // Decrement if (Counter[Ghost2LocalMap[edge]] == 0) { - S--; // Decrement S + (*SPtr)--; // Decrement S #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; fflush(stdout); #endif } + } // End of if Counter[edge] > 0 // End: PARALLEL_PROCESS_CROSS_EDGE_B - *SPtr = S; } \ No newline at end of file From dc1675766fe1fc8d75bf1c264176178fa1d2ae85 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 13 Jul 2022 16:19:38 -0500 Subject: [PATCH 51/96] processMessages.cpp further refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 43 ++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 246 ++++-------------- .../impl/aggregator/computeCandidateMate.cpp | 24 +- amgprec/impl/aggregator/processMessages.cpp | 224 +++++++++++++++- 4 files changed, 304 insertions(+), 233 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 0b3dcd74..eecc57ce 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -345,17 +345,38 @@ extern "C" vector &SRequest, vector &SStatus); - void processMessages(int error_codeC, - MilanInt numProcs, - MilanInt myRank, - int ComputeTag, - int BundleTag, - MPI_Comm comm, - vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, - MilanLongInt *BundleSizePtr); + void processMessages( + MilanLongInt NLVer, + MilanLongInt *Mate, + MilanLongInt *candidateMate, + map &Ghost2LocalMap, + vector &GMate, + vector &Counter, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *msgActualPtr, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *verLocPtr, + MilanLongInt k, + MilanLongInt *verLocInd, + int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt u, + MilanLongInt v, + MilanLongInt message_type, + MilanLongInt *SPtr, + staticQueue &U); void extractUChunk( vector &UChunkBeingProcessed, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e45ee792..c5a81c4d 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -598,7 +598,23 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// 
/////////////////////////////////////////////////////////////////////////////////// - processMessages(error_codeC, + processMessages(NLVer, + Mate, + candidateMate, + Ghost2LocalMap, + GMate, + Counter, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &msgActual, + edgeLocWeight, + verDistance, + verLocPtr, + k, + verLocInd, + error_codeC, numProcs, myRank, ComputeTag, @@ -608,205 +624,41 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( error_message, message_length, ReceiveBuffer, - &bundleSize); - - bundleCounter = 0; - while (bundleCounter < bundleSize) - { - u = ReceiveBuffer[bundleCounter]; // GHOST - bundleCounter++; - v = ReceiveBuffer[bundleCounter]; // LOCAL - bundleCounter++; - message_type = ReceiveBuffer[bundleCounter]; // TYPE - bundleCounter++; - - // CASE I: REQUEST - if (message_type == REQUEST) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is REQUEST" << endl; - fflush(stdout); -#endif -#ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } + u, + v, + message_type, + &S, + U); -#endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched (v is local) - candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost - if (candidateMate[v - StartIndex] == u) - { - GMate[Ghost2LocalMap[u]] = v; // u is ghost - Mate[v - StartIndex] = u; // v is local - // Q.push_back(u); - U.push_back(v); - U.push_back(u); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; - fflush(stdout); -#endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); - } // End of if ( candidateMate[v-StartIndex] == u )e - } // End of if ( Mate[v] == -1 ) - } // End of REQUEST - else - { // CASE II: SUCCESS - if (message_type == SUCCESS) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; - fflush(stdout); -#endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); -#ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } -#endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched ( v is local) - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); - candidateMate[v - StartIndex] = w; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); -#endif - ghostOwner = 
findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - GMate[Ghost2LocalMap[w]] = v; // w is ghost - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); -#endif - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); -#endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - { // No dominant edge found - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) + ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of if ( candidateMate[v-StartIndex] == u ) - } // End of if ( Mate[v] == -1 ) - } // End of if ( message_type == SUCCESS ) - else - { // CASE III: FAILURE -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is FAILURE" << endl; - fflush(stdout); -#endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); - } // End of else: CASE III - } // End of else: CASE I - } // End of if (!MsgQ.empty()) - ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; + fflush(stdout); #endif - } // End of while (true) - - clean(NLVer, - myRank, - MessageIndex, - SRequest, - SStatus, - BufferSize, - Buffer, - msgActual, - msgActualSent, - msgInd, - msgIndSent, - 
NumMessagesBundled, - msgPercent, - MateLock); - - finishTime = MPI_Wtime(); - *ph2_time = finishTime - startTime; // Time taken for Phase-2 - *ph2_card = myCard; // Cardinality at the end of Phase-2 +} // End of while (true) + +clean(NLVer, + myRank, + MessageIndex, + SRequest, + SStatus, + BufferSize, + Buffer, + msgActual, + msgActualSent, + msgInd, + msgIndSent, + NumMessagesBundled, + msgPercent, + MateLock); + +finishTime = MPI_Wtime(); +*ph2_time = finishTime - startTime; // Time taken for Phase-2 +*ph2_card = myCard; // Cardinality at the end of Phase-2 } // End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate #endif diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp index 36eaa727..f79fc613 100644 --- a/amgprec/impl/aggregator/computeCandidateMate.cpp +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -10,9 +10,9 @@ * @return */ MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt *verLocInd, - MilanReal *edgeLocWeight) + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN @@ -46,15 +46,15 @@ MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, * @return */ MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal *edgeLocWeight, - MilanLongInt k, - MilanLongInt *verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap) + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) { // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index cb43cdb8..7d44b281 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,21 +1,45 @@ #include "MatchBoxPC.h" -void processMessages(int error_codeC, - MilanInt numProcs, - MilanInt myRank, - int ComputeTag, - int BundleTag, - MPI_Comm comm, - vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, - MilanLongInt *BundleSizePtr) +void processMessages( + MilanLongInt NLVer, + MilanLongInt *Mate, + MilanLongInt *candidateMate, + map &Ghost2LocalMap, + vector &GMate, + vector &Counter, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *msgActualPtr, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *verLocPtr, + MilanLongInt k, + MilanLongInt *verLocInd, + int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt u, + MilanLongInt v, + MilanLongInt message_type, + MilanLongInt *SPtr, + staticQueue &U) { MilanInt Sender; MPI_Status computeStatus; - MilanLongInt bundleSize = *BundleSizePtr; + MilanLongInt bundleSize, bundleCounter = 0, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; + MilanLongInt S = *SPtr; // TODO refactor this + MilanLongInt adj11, adj12, k1; + MilanLongInt ghostOwner; #ifdef PRINT_DEBUG_INFO_ cout @@ -118,6 +142,180 @@ void processMessages(int error_codeC, fflush(stdout); #endif - 
*BundleSizePtr = bundleSize; + bundleCounter = 0; + while (bundleCounter < bundleSize) + { + u = ReceiveBuffer[bundleCounter]; // GHOST + bundleCounter++; + v = ReceiveBuffer[bundleCounter]; // LOCAL + bundleCounter++; + message_type = ReceiveBuffer[bundleCounter]; // TYPE + bundleCounter++; + + // CASE I: REQUEST + if (message_type == REQUEST) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is REQUEST" << endl; + fflush(stdout); +#endif +#ifdef DEBUG_GHOST_ + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) + { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } + +#endif + + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) + { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + // Q.push_back(u); + U.push_back(v); + U.push_back(u); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; + fflush(stdout); +#endif + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else + { // CASE II: SUCCESS + if (message_type == SUCCESS) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); +#endif + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); +#ifdef DEBUG_GHOST_ + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) + { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } +#endif + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); + candidateMate[v - StartIndex] = w; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + msgInd++; + msgActual++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); +#endif + // 
Decrement the counter: + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + msgInd++; + msgActual++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of if ( candidateMate[v-StartIndex] == u ) + } // End of if ( Mate[v] == -1 ) + } // End of if ( message_type == SUCCESS ) + else + { // CASE III: FAILURE +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is FAILURE" << endl; + fflush(stdout); +#endif + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I + } + + *myCardPtr = myCard; + *SPtr = S; + *msgIndPtr = msgInd; + *msgActualPtr = msgActual; return; } \ No newline at end of file From 47c6f4f2f8787ba7b70d7994b853b0799dddfacf Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 13 Jul 2022 16:19:52 -0500 Subject: [PATCH 52/96] comments --- ...dgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 9 +++++---- amgprec/impl/aggregator/processMessages.cpp | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index c5a81c4d..2ebb2876 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -121,14 +121,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - // inputSubGraph.getStartEndIndices(StartIndex, EndIndex); MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank - // MilanLongInt EndIndex = verDistance[myRank+1]; //The ending vertex owned by the current rank MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; const int ComputeTag = 7; // Predefined tag - const int BundleTag = 9; // 
Predefined tag + const int BundleTag = 9; // Predefined tag //TODO refactor this + + //TODO refactor this int error_codeC; error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); char error_message[MPI_MAX_ERROR_STRING]; @@ -144,7 +144,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( // Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] Message.resize(3, -1); - MilanLongInt message_type = 0; + MilanLongInt message_type = 0; //TODO refactor this, it could be constants // Data structures for Message Bundling: // Although up to two messages can be sent along any cross edge, // only one message will be sent in the initialization phase - @@ -158,6 +158,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( PCounter[i] = 0; MilanLongInt NumMessagesBundled = 0; + //TODO when the last computational section will be refactored this could be eliminated MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! MilanLongInt *candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 7d44b281..2ae84317 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,5 +1,7 @@ #include "MatchBoxPC.h" +//TODO there are many useless parameter passed to this function + void processMessages( MilanLongInt NLVer, MilanLongInt *Mate, From 1ea1be33badee014e6208a3bbb0e430f1d1ef054 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 14 Jul 2022 15:23:32 -0500 Subject: [PATCH 53/96] Refactoring, eliminated useless passed variables --- amgprec/impl/aggregator/MatchBoxPC.h | 13 ++---- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 41 +++---------------- amgprec/impl/aggregator/processMessages.cpp | 29 +++++++++---- .../impl/aggregator/sendBundledMessages.cpp | 3 -- 4 files changed, 29 insertions(+), 57 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index eecc57ce..fc141b43 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -73,6 +73,9 @@ const MilanLongInt SUCCESS = 2; const MilanLongInt FAILURE = 3; const MilanLongInt SIZEINFO = 4; +const int ComputeTag = 7; // Predefined tag +const int BundleTag = 9; // Predefined tag + // MPI type map template MPI_Datatype TypeMap(); @@ -335,8 +338,6 @@ extern "C" MilanLongInt *MessageIndexPtr, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &QLocalVtx, vector &QGhostVtx, @@ -362,19 +363,13 @@ extern "C" MilanLongInt *verLocPtr, MilanLongInt k, MilanLongInt *verLocInd, - int error_codeC, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, + MilanLongInt numGhostEdges, MilanLongInt u, MilanLongInt v, - MilanLongInt message_type, MilanLongInt *SPtr, staticQueue &U); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 2ebb2876..45c49ec0 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -125,26 +125,16 @@ void 
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; - const int ComputeTag = 7; // Predefined tag - const int BundleTag = 9; // Predefined tag //TODO refactor this - //TODO refactor this - int error_codeC; - error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); - char error_message[MPI_MAX_ERROR_STRING]; - int message_length; - - // MilanLongInt NLVer=0, NLEdge=0, StartIndex=0, EndIndex=0; MilanLongInt msgActual = 0, msgInd = 0; MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight MilanReal startTime, finishTime; - // MilanReal Precision = MPI_Wtick(); //Get the precision of the MPI Timer + startTime = MPI_Wtime(); // Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] Message.resize(3, -1); - MilanLongInt message_type = 0; //TODO refactor this, it could be constants // Data structures for Message Bundling: // Although up to two messages can be sent along any cross edge, // only one message will be sent in the initialization phase - @@ -204,7 +194,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt *Buffer; // Declare the locks - // TODO destroy the locks omp_lock_t MateLock[NLVer]; initialize(NLVer, NLEdge, StartIndex, @@ -341,8 +330,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &MessageIndex, numProcs, myRank, - ComputeTag, - BundleTag, + //ComputeTag, + //BundleTag, comm, QLocalVtx, QGhostVtx, @@ -376,21 +365,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); fflush(stdout); #endif - // Buffer to receive bundled messages - // Maximum messages that can be received from any processor is - // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) - vector ReceiveBuffer; - MilanLongInt bundleSize = 0, bundleCounter = 0; - try - { - ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } + while (true) { #ifdef DEBUG_HANG_ @@ -615,19 +590,13 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( verLocPtr, k, verLocInd, - error_codeC, numProcs, myRank, - ComputeTag, - BundleTag, comm, Message, - error_message, - message_length, - ReceiveBuffer, + numGhostEdges, u, v, - message_type, &S, U); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 2ae84317..c487042a 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,7 +1,5 @@ #include "MatchBoxPC.h" -//TODO there are many useless parameter passed to this function - void processMessages( MilanLongInt NLVer, MilanLongInt *Mate, @@ -19,19 +17,13 @@ void processMessages( MilanLongInt *verLocPtr, MilanLongInt k, MilanLongInt *verLocInd, - int error_codeC, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, + MilanLongInt numGhostEdges, MilanLongInt u, MilanLongInt v, - MilanLongInt message_type, MilanLongInt *SPtr, staticQueue &U) { @@ -42,6 +34,25 @@ void processMessages( MilanLongInt S = *SPtr; // TODO refactor this MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; + int error_codeC; + 
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); + char error_message[MPI_MAX_ERROR_STRING]; + int message_length; + MilanLongInt message_type = 0; + // Buffer to receive bundled messages + // Maximum messages that can be received from any processor is + // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) + vector ReceiveBuffer; + try + { + ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } #ifdef PRINT_DEBUG_INFO_ cout diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index 382d8a16..f3dd3e46 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -12,8 +12,6 @@ void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanLongInt *MessageIndexPtr, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &QLocalVtx, vector &QGhostVtx, @@ -24,7 +22,6 @@ void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, { MilanLongInt myIndex = 0, msgActual = *msgActualPtr, MessageIndex = *MessageIndexPtr, numGhostEdges = *numGhostEdgesPtr, numMessagesToSend; - const MilanLongInt SIZEINFO = 4; MilanInt i = 0, OneMessageSize = 0, BufferSize = *BufferSizePtr; #ifdef DEBUG_HANG_ From f17082b3375d95bcfe75df2a32b215ad09581f4c Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 14 Jul 2022 15:27:53 -0500 Subject: [PATCH 54/96] Refactoring: eliminatino of SPtr inside processMessages --- amgprec/impl/aggregator/processMessages.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c487042a..7ce867ff 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -24,14 +24,13 @@ void processMessages( MilanLongInt numGhostEdges, MilanLongInt u, MilanLongInt v, - MilanLongInt *SPtr, + MilanLongInt *S, staticQueue &U) { MilanInt Sender; MPI_Status computeStatus; MilanLongInt bundleSize, bundleCounter = 0, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; - MilanLongInt S = *SPtr; // TODO refactor this MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; int error_codeC; @@ -39,6 +38,7 @@ void processMessages( char error_message[MPI_MAX_ERROR_STRING]; int message_length; MilanLongInt message_type = 0; + // Buffer to receive bundled messages // Maximum messages that can be received from any processor is // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) @@ -61,7 +61,7 @@ void processMessages( fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")About to begin Message processing phase ... S=" << S << endl; + cout << "\n(" << myRank << ")About to begin Message processing phase ... 
*S=" << *S << endl; fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ @@ -196,7 +196,7 @@ void processMessages( cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -209,7 +209,7 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -262,7 +262,7 @@ void processMessages( fflush(stdout); #endif // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -321,13 +321,12 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); // Decrease the counter + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); // Decrease the counter } // End of else: CASE III } // End of else: CASE I } *myCardPtr = myCard; - *SPtr = S; *msgIndPtr = msgInd; *msgActualPtr = msgActual; return; From 5ca78fb871c963a93578af05e2b330a08a964334 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 14 Jul 2022 17:10:36 -0500 Subject: [PATCH 55/96] Refactoring isAlreadyMatched and processCrossEdge --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 4 +-- amgprec/impl/aggregator/isAlreadyMatched.cpp | 6 ++-- amgprec/impl/aggregator/processCrossEdge.cpp | 7 ++-- .../impl/aggregator/processExposedVertex.cpp | 2 +- .../aggregator/processMatchedVertices.cpp | 2 +- amgprec/impl/aggregator/processMessages.cpp | 36 +++++++++---------- 7 files changed, 25 insertions(+), 33 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index fc141b43..a8f22f49 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -290,7 +290,6 @@ extern "C" staticQueue &privateQOwner); void PROCESS_CROSS_EDGE(vector &Counter, - map &Ghost2LocalMap, MilanLongInt edge, MilanLongInt *SPtr); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 45c49ec0..6e24393b 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -142,7 +142,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! 
- // TODO move this inseide the initialization function MilanLongInt *PCounter = new MilanLongInt[numProcs]; for (int i = 0; i < numProcs; i++) PCounter[i] = 0; @@ -358,7 +357,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Entering While(true) loop.."; fflush(stdout); - // U.display(); fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -468,7 +466,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index dbb1052f..38ae73f5 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -23,11 +23,9 @@ bool isAlreadyMatched(MilanLongInt node, #pragma omp critical(Mate) { if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched - result = true; + result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched } else { //A local vertex - if (Mate[node - StartIndex] >= 0) // Already matched - result = true; + result = (Mate[node - StartIndex] >= 0); // Already matched } } diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 1ef99560..ee367a61 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,16 +1,15 @@ #include "MatchBoxPC.h" void PROCESS_CROSS_EDGE(vector &Counter, - map &Ghost2LocalMap, MilanLongInt edge, MilanLongInt *SPtr) { // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B - if (Counter[Ghost2LocalMap[edge]] > 0) + if (Counter[edge] > 0) { - Counter[Ghost2LocalMap[edge]] -= 1; // Decrement - if (Counter[Ghost2LocalMap[edge]] == 0) + Counter[edge] -= 1; // Decrement + if (Counter[edge] == 0) { (*SPtr)--; // Decrement S #ifdef PRINT_DEBUG_INFO_ diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 50a5ecfd..97840b19 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -115,7 +115,7 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, //TODO refactor this!! 
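// After this refactor PROCESS_CROSS_EDGE no longer receives Ghost2LocalMap:
// the caller resolves the ghost vertex to its local slot first, so the two
// calls below are equivalent (w is a ghost vertex, S the phase counter):
//
//   PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S);    // old signature
//   PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S);    // new signature
//
// Inside the function, Counter[edge] is decremented and, once it drops to
// zero, S is decremented as well.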
// Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index aaef21a1..c1ae6d13 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -174,7 +174,7 @@ void processMatchedVertices( // TODO refactor this // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 7ce867ff..7e5c3915 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -30,7 +30,7 @@ void processMessages( MilanInt Sender; MPI_Status computeStatus; - MilanLongInt bundleSize, bundleCounter = 0, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; + MilanLongInt bundleSize, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; int error_codeC; @@ -155,15 +155,13 @@ void processMessages( fflush(stdout); #endif - bundleCounter = 0; - while (bundleCounter < bundleSize) + + //Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop + for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { - u = ReceiveBuffer[bundleCounter]; // GHOST - bundleCounter++; - v = ReceiveBuffer[bundleCounter]; // LOCAL - bundleCounter++; - message_type = ReceiveBuffer[bundleCounter]; // TYPE - bundleCounter++; + u = ReceiveBuffer[bundleCounter - 3]; // GHOST + v = ReceiveBuffer[bundleCounter - 2]; // LOCAL + message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE // CASE I: REQUEST if (message_type == REQUEST) @@ -188,7 +186,6 @@ void processMessages( { GMate[Ghost2LocalMap[u]] = v; // u is ghost Mate[v - StartIndex] = u; // v is local - // Q.push_back(u); U.push_back(v); U.push_back(u); myCard++; @@ -196,7 +193,8 @@ void processMessages( cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); + + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -208,8 +206,8 @@ void processMessages( cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -261,8 +259,8 @@ void processMessages( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, S); + + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -320,10 +318,10 @@ void processMessages( cout << "\n(" << myRank 
<< ")Message type is FAILURE" << endl; fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); // Decrease the counter - } // End of else: CASE III - } // End of else: CASE I + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I } *myCardPtr = myCard; From 561cadee0fd721d475f625c3441ead31790c2ede Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Fri, 15 Jul 2022 07:27:30 -0500 Subject: [PATCH 56/96] parallelQueues working --- amgprec/impl/aggregator/MatchBoxPC.h | 2 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 63 +++++++++---------- .../aggregator/processMatchedVertices.cpp | 63 +++++++------------ amgprec/impl/aggregator/queueTransfer.cpp | 1 + 4 files changed, 54 insertions(+), 75 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index a8f22f49..d1e26fbc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,7 +66,7 @@ using namespace std; #define NUM_THREAD 4 -#define UCHUNK 1000 +#define UCHUNK 100000 const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 6e24393b..d5ac4394 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -71,8 +71,6 @@ Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) */ -#define UCHUNK 1000 - #ifdef SERIAL_MPI #else @@ -121,7 +119,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank + MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; @@ -147,7 +145,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( PCounter[i] = 0; MilanLongInt NumMessagesBundled = 0; - //TODO when the last computational section will be refactored this could be eliminated + // TODO when the last computational section will be refactored this could be eliminated MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! 
MilanLongInt *candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ @@ -282,6 +280,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); + processMatchedVertices(NLVer, UChunkBeingProcessed, U, @@ -329,8 +328,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &MessageIndex, numProcs, myRank, - //ComputeTag, - //BundleTag, comm, QLocalVtx, QGhostVtx, @@ -598,35 +595,35 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &S, U); - ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// + ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; + fflush(stdout); #endif -} // End of while (true) - -clean(NLVer, - myRank, - MessageIndex, - SRequest, - SStatus, - BufferSize, - Buffer, - msgActual, - msgActualSent, - msgInd, - msgIndSent, - NumMessagesBundled, - msgPercent, - MateLock); - -finishTime = MPI_Wtime(); -*ph2_time = finishTime - startTime; // Time taken for Phase-2 -*ph2_card = myCard; // Cardinality at the end of Phase-2 + } // End of while (true) + + clean(NLVer, + myRank, + MessageIndex, + SRequest, + SStatus, + BufferSize, + Buffer, + msgActual, + msgActualSent, + msgInd, + msgIndSent, + NumMessagesBundled, + msgPercent, + MateLock); + + finishTime = MPI_Wtime(); + *ph2_time = finishTime - startTime; // Time taken for Phase-2 + *ph2_card = myCard; // Cardinality at the end of Phase-2 } // End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate #endif diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index c1ae6d13..d766bc42 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -1,6 +1,6 @@ #include "MatchBoxPC.h" -//#define privateQueues +//#define error void processMatchedVertices( MilanLongInt NLVer, @@ -38,8 +38,13 @@ void processMatchedVertices( MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; - // TODO check if private queues arrive empty -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + // TODO check that the queues arrives empty + assert(privateQGhostVtx.empty()); + assert(privateQLocalVtx.empty()); + assert(privateQMsgType.empty()); + assert(privateQOwner.empty()); + +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { #ifdef PRINT_DEBUG_INFO_ @@ -140,25 +145,18 @@ void 
processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif + msgInd++; + NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#ifdef privateQueues privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); privateQOwner.push_back(ghostOwner); -#endif -#ifndef privateQueues - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - QOwner.push_back(ghostOwner); -#endif - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { Mate[v - StartIndex] = w; // v is a local vertex @@ -214,28 +212,18 @@ void processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - - // ghostOwner = inputSubGraph.findOwner(w); + msgInd++; + NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#ifdef privateQueues privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); privateQMsgType.push_back(FAILURE); privateQOwner.push_back(ghostOwner); -#endif -#ifndef privateQueues - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); -#endif - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; } // End of if(GHOST) } // End of for loop } // End of Else: w == -1 @@ -270,26 +258,18 @@ void processMatchedVertices( fflush(stdout); #endif + msgInd++; + NumMessagesBundled++; ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#ifdef privateQueues privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); privateQMsgType.push_back(SUCCESS); privateQOwner.push_back(ghostOwner); -#endif -#ifndef privateQueues - QLocalVtx.push_back(u); - QGhostVtx.push_back(v); - QMsgType.push_back(SUCCESS); - QOwner.push_back(ghostOwner); -#endif - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; } // End of If( v != Mate[u] ) // omp_unset_lock(&MateLock[u - StartIndex]); @@ -306,14 +286,15 @@ void processMatchedVertices( if (privateU.size() < UCHUNK && !U.empty()) continue; -#ifdef privateQueues + printf("Executed \n"); +#ifdef error #pragma omp critical(U) { while (!privateU.empty()) U.push_back(privateU.pop_back()); } #endif -#ifndef privateQueues +#ifndef error queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, @@ -322,7 +303,7 @@ void processMatchedVertices( privateQOwner); #endif } - } // End of while ( /*!Q.empty()*/ !U.empty() ) + } // End of while ( !U.empty() ) queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index ed2829c6..cbae1fc2 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -12,6 +12,7 @@ void queuesTransfer(staticQueue &U, staticQueue &privateQOwner) { + #pragma omp critical(U) { while (!privateU.empty()) From a9bb6b26fad62f7e95438b0495810d02c7313f23 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 16 Jul 2022 11:20:39 -0500 Subject: [PATCH 57/96] processMatchedVertices partially working mixed critical and 
lock version --- amgprec/impl/aggregator/MatchBoxPC.h | 5 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 4 +- .../aggregator/processMatchedVertices.cpp | 288 +++++++++--------- 3 files changed, 151 insertions(+), 146 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index d1e26fbc..58053c18 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,7 +66,7 @@ using namespace std; #define NUM_THREAD 4 -#define UCHUNK 100000 +#define UCHUNK 1000 const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; @@ -323,7 +323,8 @@ extern "C" staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner); + staticQueue &privateQOwner, + omp_lock_t *MateLock); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanInt *BufferSizePtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d5ac4394..ffe27f68 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -281,6 +281,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); +//#define PRINT_DEBUG_INFO_ processMatchedVertices(NLVer, UChunkBeingProcessed, U, @@ -310,7 +311,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( privateQLocalVtx, privateQGhostVtx, privateQMsgType, - privateQOwner); + privateQOwner, + MateLock); ///////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index d766bc42..d92f1a57 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -32,30 +32,24 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner) + staticQueue &privateQOwner, + omp_lock_t *MateLock) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; - // TODO check that the queues arrives empty - assert(privateQGhostVtx.empty()); - assert(privateQLocalVtx.empty()); - assert(privateQMsgType.empty()); - assert(privateQOwner.empty()); - -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) - { - #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); #endif #ifdef COUNT_LOCAL_VERTEX - MilanLongInt localVertices = 0; + MilanLongInt localVertices = 0; #endif +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) 
firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) + { // TODO what would be the optimal UCHUNK // TODO refactor @@ -89,23 +83,29 @@ void processMatchedVertices( if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: -#pragma omp critical(innerProcessMatched) - { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) + // If the current vertex is pointing to a matched vertex and is not matched + // FIXME is there a way to make candidateMate private? + // for the moment it could generate an error. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { + + bool seh = false; +#pragma omp critical(prova) { + seh = candidateMate[v - StartIndex] != u; + } + if (seh) + continue; +#pragma omp critical(prova) + { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, 0, @@ -117,48 +117,44 @@ void processMatchedVertices( Ghost2LocalMap); candidateMate[v - StartIndex] = w; + } - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - - // TODO is it possible to lock without a critical region? 
- // TODO there must be a more elegant and efficient way to do this - /* - while(true) { - if (omp_test_lock(&MateLock[v - StartIndex])) { - if (omp_test_lock(&MateLock[w - StartIndex])) break; - else omp_unset_lock(&MateLock[v - StartIndex]); - } - } - */ + // If found a dominating edge: + if (w >= 0) + { - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + msgInd++; +#pragma omp atomic + NumMessagesBundled++; - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); +#pragma omp critical(prova) + { if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + while (!omp_test_lock(&MateLock[v - StartIndex])) + ; Mate[v - StartIndex] = w; // v is a local vertex GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex // Q.push_back(u); @@ -170,16 +166,23 @@ void processMatchedVertices( fflush(stdout); #endif - // TODO refactor this // Decrement the counter: PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); - + omp_unset_lock(&MateLock[v - StartIndex]); } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex + } + } // End of if a Ghost Vertex + else + { // w is a local vertex +#pragma omp critical(prova) + { if (candidateMate[w - StartIndex] == v) { + while (!omp_test_lock(&MateLock[v - StartIndex])) + ; + while (!omp_test_lock(&MateLock[w - StartIndex])) + ; + Mate[v - StartIndex] = w; // v is a local vertex Mate[w - StartIndex] = v; // w is a local vertex // Q.push_back(u); @@ -190,121 +193,120 @@ void processMatchedVertices( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif + omp_unset_lock(&MateLock[v - StartIndex]); + omp_unset_lock(&MateLock[w - StartIndex]); } // End of if(CandidateMate(w) = v - } // End of Else - - // omp_unset_lock(&MateLock[v - StartIndex]); - // omp_unset_lock(&MateLock[w - StartIndex]); + } + } // End of Else - } // End of if(w >=0) - else + } // End of if(w >=0) + else + { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, 
myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + msgInd++; +#pragma omp atomic + NumMessagesBundled++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); - } // End of If (candidateMate[v-StartIndex] == u + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of critical region if + } // End of If (candidateMate[v-StartIndex] == u } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor is a ghost vertex -#pragma omp critical(innerProcessMatched) + while (!omp_test_lock(&MateLock[u - StartIndex])) + ; +#pragma omp critical(prova) { - - // while(!omp_test_lock(&MateLock[u - StartIndex])); - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is local - // Build the Message Packet: - // Message[0] = u; //LOCAL - // Message[1] = v; //GHOST - // Message[2] = SUCCESS; //TYPE - // Send a Request (Asynchronous) + } + if (v != Mate[u - StartIndex]) + { // u is local #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - } // End of If( v != Mate[u] ) + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + msgInd++; +#pragma omp atomic + NumMessagesBundled++; - // omp_unset_lock(&MateLock[u - StartIndex]); + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); - } // End of critical region - } // End of Else //A Ghost Vertex + } // End of If( v != Mate[u] ) - } // End of For Loop adj(u) + omp_unset_lock(&MateLock[u - StartIndex]); - } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } // End of Else //A Ghost 
Vertex - // Ask for the critical section only when a certain amount - // of data have been accumulated in the private queue - if (privateU.size() < UCHUNK && !U.empty()) - continue; + } // End of for - printf("Executed \n"); -#ifdef error + // TODO commenting that part of code might generate errors + // Ask for the critical section only when there are no more data to + // compute. + if (/*privateU.size() < UCHUNK &&*/ !U.empty()) + continue; #pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } -#endif + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } + #ifndef error - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); + } + } #endif + } } } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, @@ -329,4 +331,4 @@ void processMatchedVertices( *msgIndPtr = msgInd; *NumMessagesBundledPtr = NumMessagesBundled; *SPtr = S; -} \ No newline at end of file +} From 1374f21ba807ed0483050477726b1452a3037302 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 16 Jul 2022 13:54:40 -0500 Subject: [PATCH 58/96] refactor increment on variables passed by reference in processMatchedVertices.cpp --- .../aggregator/processMatchedVertices.cpp | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index d92f1a57..3816080c 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -37,7 +37,6 @@ void processMatchedVertices( { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; - MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -48,7 +47,7 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { // TODO what would be the optimal UCHUNK @@ -140,9 +139,9 @@ void processMatchedVertices( #pragma omp atomic PCounter[ghostOwner]++; #pragma omp atomic - msgInd++; + (*msgIndPtr)++; #pragma omp atomic - NumMessagesBundled++; + (*NumMessagesBundledPtr)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -160,14 +159,15 @@ void processMatchedVertices( // Q.push_back(u); privateU.push_back(v); privateU.push_back(w); - privateMyCard++; +#pragma omp atomic + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; 
fflush(stdout); #endif // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); omp_unset_lock(&MateLock[v - StartIndex]); } // End of if CandidateMate[w] = v } @@ -188,7 +188,8 @@ void processMatchedVertices( // Q.push_back(u); privateU.push_back(v); privateU.push_back(w); - privateMyCard++; +#pragma omp atomic + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -222,9 +223,9 @@ void processMatchedVertices( #pragma omp atomic PCounter[ghostOwner]++; #pragma omp atomic - msgInd++; + (*msgIndPtr)++; #pragma omp atomic - NumMessagesBundled++; + (*NumMessagesBundledPtr)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -264,10 +265,9 @@ void processMatchedVertices( #pragma omp atomic PCounter[ghostOwner]++; #pragma omp atomic - msgInd++; + (*msgIndPtr)++; #pragma omp atomic - NumMessagesBundled++; - + (*NumMessagesBundledPtr)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); privateQMsgType.push_back(SUCCESS); @@ -314,11 +314,6 @@ void processMatchedVertices( privateQMsgType, privateQOwner); -// TODO it is possible that this is not working as expected -// further investigation needed. -#pragma omp atomic - myCard += privateMyCard; - #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, @@ -327,8 +322,4 @@ void processMatchedVertices( #endif } - *myCardPtr = myCard; - *msgIndPtr = msgInd; - *NumMessagesBundledPtr = NumMessagesBundled; - *SPtr = S; } From 71d4cdc3191ab782a4f90ae25e18eca187f1de2f Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 06:11:11 -0500 Subject: [PATCH 59/96] processMatchedVertices rollback to critical regions --- amgprec/impl/aggregator/MatchBoxPC.h | 41 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 6 +- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 2 + .../aggregator/processMatchedVertices.cpp | 400 ++++++++---------- 4 files changed, 207 insertions(+), 242 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 58053c18..fba63883 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,7 +66,7 @@ using namespace std; #define NUM_THREAD 4 -#define UCHUNK 1000 +#define UCHUNK 5 const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; @@ -295,7 +295,6 @@ extern "C" void processMatchedVertices( MilanLongInt NLVer, - vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, @@ -326,25 +325,25 @@ extern "C" staticQueue &privateQOwner, omp_lock_t *MateLock); - void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, - MilanInt *BufferSizePtr, - MilanLongInt *Buffer, - vector &PCumulative, - vector &PMessageBundle, - vector &PSizeInfoMessages, - MilanLongInt *PCounter, - MilanLongInt NumMessagesBundled, - MilanLongInt *msgActualPtr, - MilanLongInt *MessageIndexPtr, - MilanInt numProcs, - MilanInt myRank, - MPI_Comm comm, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - vector &SRequest, - vector &SStatus); + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt 
myRank, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); void processMessages( MilanLongInt NLVer, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ffe27f68..950e844b 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -278,12 +278,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); - -//#define PRINT_DEBUG_INFO_ processMatchedVertices(NLVer, - UChunkBeingProcessed, + //UChunkBeingProcessed, U, privateU, StartIndex, diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 59a87bc3..1c41b439 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -1,5 +1,7 @@ #include "MatchBoxPC.h" + +//TODO parallelize this ///Find the owner of a ghost node: MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs) { diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 3816080c..6255b35c 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -4,7 +4,6 @@ void processMatchedVertices( MilanLongInt NLVer, - vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, @@ -37,7 +36,6 @@ void processMatchedVertices( { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; - #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -50,276 +48,246 @@ void processMatchedVertices( #pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { - // TODO what would be the optimal UCHUNK - // TODO refactor - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); + // TODO what would be the optimal UCHUNK + // TODO refactor + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); - while (!U.empty()) - { + while (!U.empty()) + { - extractUChunk(UChunkBeingProcessed, U, privateU); + extractUChunk(UChunkBeingProcessed, U, privateU); - for (MilanLongInt u : UChunkBeingProcessed) - { + for (MilanLongInt u : UChunkBeingProcessed) + { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; 
k++) - { - v = verLocInd[k]; + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - - bool seh = false; -#pragma omp critical(prova) - { - seh = candidateMate[v - StartIndex] != u; - } - if (seh) - continue; - -#pragma omp critical(prova) + // If the current vertex is pointing to a matched vertex and is not matched + // FIXME is there a way to make candidateMate private? + // for the moment it could generate an error. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - } +#pragma omp critical + { + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { + // If found a dominating edge: + if (w >= 0) + { - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - -#pragma omp critical(prova) - { - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - while (!omp_test_lock(&MateLock[v - StartIndex])) - ; - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // 
w is a ghost vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); + (*NumMessagesBundledPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + privateU.push_back(v); + privateU.push_back(w); #pragma omp atomic - (*myCardPtr)++; + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); - omp_unset_lock(&MateLock[v - StartIndex]); - } // End of if CandidateMate[w] = v - } - } // End of if a Ghost Vertex - else - { // w is a local vertex -#pragma omp critical(prova) - { - if (candidateMate[w - StartIndex] == v) - { - while (!omp_test_lock(&MateLock[v - StartIndex])) - ; - while (!omp_test_lock(&MateLock[w - StartIndex])) - ; - - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); + // Decrement the counter: + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + privateU.push_back(v); + privateU.push_back(w); #pragma omp atomic - (*myCardPtr)++; + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - omp_unset_lock(&MateLock[v - StartIndex]); - omp_unset_lock(&MateLock[w - StartIndex]); - } // End of if(CandidateMate(w) = v - } - } // End of Else - - } // End of if(w >=0) - else - { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + } // End of if(CandidateMate(w) = v + } // End of Else + + } // End of if(w >=0) + else + { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - 
privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - - } // End of If (candidateMate[v-StartIndex] == u - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex - - while (!omp_test_lock(&MateLock[u - StartIndex])) - ; -#pragma omp critical(prova) - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - } - if (v != Mate[u - StartIndex]) - { // u is local + (*NumMessagesBundledPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } + } // End of task + } // End of If (candidateMate[v-StartIndex] == u + + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + { // u is local #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - } // End of If( v != Mate[u] ) - - omp_unset_lock(&MateLock[u - StartIndex]); + (*NumMessagesBundledPtr)++; + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); - } // End of Else //A Ghost Vertex + } // End of If( v != Mate[u] ) - } // End of for + } // End of task + } // End of Else //A Ghost Vertex + } // End of inner for - // TODO commenting that part of code might generate errors - // Ask for the critical section only when there are no more data to - // compute. - if (/*privateU.size() < UCHUNK &&*/ !U.empty()) - continue; + // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? 
+ if (privateU.size() > UCHUNK || U.empty()) + { #pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } #ifndef error #pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } - } + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); + } + } + #endif - } - } - } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + } // End of private.size() + } + } // End of outer for + } // End of while ( !U.empty() ) + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); #endif - } + } // End of parallel region } From 9ab54adf3f392de5a30dac7df1533e54dd4d5c39 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 08:59:23 -0500 Subject: [PATCH 60/96] processMatchedVertices parallelized --- .../aggregator/processMatchedVertices.cpp | 314 +++++++++--------- 1 file changed, 166 insertions(+), 148 deletions(-) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 6255b35c..bfdbd6cb 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -36,6 +36,7 @@ void processMatchedVertices( { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + int option; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -45,56 +46,55 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { - // TODO what would be the optimal UCHUNK - // TODO refactor - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); + // TODO what would be the optimal UCHUNK + // TODO refactor + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); - while (!U.empty()) - { + while (!U.empty()) + { - extractUChunk(UChunkBeingProcessed, U, privateU); + extractUChunk(UChunkBeingProcessed, U, privateU); - for (MilanLongInt u : UChunkBeingProcessed) - { + for (MilanLongInt u : UChunkBeingProcessed) + { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: 
" << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - v = verLocInd[k]; + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + option = -1; + v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { + // If the current vertex is pointing to a matched vertex and is not matched + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { #pragma omp critical + { + if (candidateMate[v - StartIndex] == u) { - if (candidateMate[v - StartIndex] == u) - { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], @@ -122,171 +122,189 @@ void processMatchedVertices( cout << "\n(" << myRank << ")Sending a request message:"; cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic - PCounter[ghostOwner]++; -#pragma omp atomic - (*msgIndPtr)++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); + option = 2; if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + option = 1; Mate[v - StartIndex] = w; // v is a local vertex GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - privateU.push_back(v); - privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif // Decrement the counter: PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex + } // End of if a Ghost Vertex else { // w is a local vertex if (candidateMate[w - StartIndex] == v) { + option = 3; Mate[v - StartIndex] = w; // v is a local vertex Mate[w - StartIndex] = v; // w is a local vertex - privateU.push_back(v); - privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif } // End of if(CandidateMate(w) = v - } // End of Else + } // End of Else } // End of if(w >=0) - else - { - adj11 = verLocPtr[v - StartIndex]; 
- adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + else option = 4;// End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } + } // End of task + } // End of If (candidateMate[v-StartIndex] == u + + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + case 2: + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } - } // End of task - } // End of If (candidateMate[v-StartIndex] == u - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex + (*NumMessagesBundledPtr)++; + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost -#pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is local +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + (*msgIndPtr)++; +#pragma omp atomic + (*NumMessagesBundledPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + 
privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + default: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); + (*NumMessagesBundledPtr)++; + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); - } // End of If( v != Mate[u] ) + break; + } //End of switch - } // End of task - } // End of Else //A Ghost Vertex - } // End of inner for + } // End of inner for - // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? - if (privateU.size() > UCHUNK || U.empty()) - { + // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? + if (privateU.size() > UCHUNK || U.empty()) + { #pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } #ifndef error #pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); } + } #endif - } // End of private.size() - } - } // End of outer for - } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + } // End of private.size() + } + } // End of outer for + } // End of while ( !U.empty() ) + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); #endif } // End of parallel region From d24c8c2d46591bf818fe14324e7af7ee2455b8bb Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 09:43:48 -0500 Subject: [PATCH 61/96] processCrossEdges is now atomic --- amgprec/impl/aggregator/processCrossEdge.cpp | 24 +++++----- .../aggregator/processMatchedVertices.cpp | 45 ++++++++++--------- 2 files 
changed, 35 insertions(+), 34 deletions(-) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index ee367a61..45b3918e 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -4,20 +4,20 @@ void PROCESS_CROSS_EDGE(vector &Counter, MilanLongInt edge, MilanLongInt *SPtr) { - // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B - if (Counter[edge] > 0) - { - Counter[edge] -= 1; // Decrement - if (Counter[edge] == 0) - { - (*SPtr)--; // Decrement S + MilanLongInt captureCounter; + +#pragma omp atomic capture + captureCounter = --Counter[edge]; // Decrement + + if (captureCounter == 0) +#pragma omp atomic + (*SPtr)--; // Decrement S + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; - fflush(stdout); + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; + fflush(stdout); #endif - } - } // End of if Counter[edge] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B + // End: PARALLEL_PROCESS_CROSS_EDGE_B } \ No newline at end of file diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index bfdbd6cb..87ea7a4f 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -130,8 +130,6 @@ void processMatchedVertices( Mate[v - StartIndex] = w; // v is a local vertex GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -150,7 +148,8 @@ void processMatchedVertices( } // End of Else } // End of if(w >=0) - else option = 4;// End of Else: w == -1 + else + option = 4; // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } } // End of task @@ -164,10 +163,10 @@ void processMatchedVertices( { if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) option = 5; // u is local - } // End of critical - } // End of Else //A Ghost Vertex - + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex switch (option) { @@ -184,6 +183,8 @@ void processMatchedVertices( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif + // Decrement the counter: + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); case 2: // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); @@ -241,29 +242,29 @@ void processMatchedVertices( } // End of for loop break; default: - + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic 
- PCounter[ghostOwner]++;
+ PCounter[ghostOwner]++;
 #pragma omp atomic
- (*msgIndPtr)++;
+ (*msgIndPtr)++;
 #pragma omp atomic
- (*NumMessagesBundledPtr)++;
- privateQLocalVtx.push_back(u);
- privateQGhostVtx.push_back(v);
- privateQMsgType.push_back(SUCCESS);
- privateQOwner.push_back(ghostOwner);
+ (*NumMessagesBundledPtr)++;
+ privateQLocalVtx.push_back(u);
+ privateQGhostVtx.push_back(v);
+ privateQMsgType.push_back(SUCCESS);
+ privateQOwner.push_back(ghostOwner);

 break;
- } //End of switch
+ } // End of switch

 } // End of inner for

From cb660e044d9203f3eee0e3e47edadef4f2a2061f Mon Sep 17 00:00:00 2001
From: StefanoPetrilli
Date: Sun, 17 Jul 2022 11:27:17 -0500
Subject: [PATCH 62/96] Remove MateLock

---
 amgprec/impl/aggregator/MatchBoxPC.h | 7 ++-----
 ...eApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 10 ++--------
 amgprec/impl/aggregator/clean.cpp | 7 +------
 amgprec/impl/aggregator/initialize.cpp | 5 -----
 amgprec/impl/aggregator/processCrossEdge.cpp | 2 ++
 amgprec/impl/aggregator/processMatchedVertices.cpp | 3 +--
 6 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h
index fba63883..1ff2cb56 100644
--- a/amgprec/impl/aggregator/MatchBoxPC.h
+++ b/amgprec/impl/aggregator/MatchBoxPC.h
@@ -217,7 +217,6 @@ extern "C"
 MilanLongInt *S,
 MilanLongInt *verLocInd,
 MilanLongInt *verLocPtr,
- omp_lock_t *MateLock,
 map &Ghost2LocalMap,
 vector &Counter,
 vector &verGhostPtr,
@@ -249,8 +248,7 @@ extern "C"
 MilanLongInt msgInd,
 MilanLongInt *msgIndSent,
 MilanLongInt NumMessagesBundled,
- MilanReal *msgPercent,
- omp_lock_t *MateLock);
+ MilanReal *msgPercent);

 void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
 MilanLongInt *verLocPtr,
@@ -322,8 +320,7 @@ extern "C"
 staticQueue &privateQLocalVtx,
 staticQueue &privateQGhostVtx,
 staticQueue &privateQMsgType,
- staticQueue &privateQOwner,
- omp_lock_t *MateLock);
+ staticQueue &privateQOwner);

 void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
 MilanInt *BufferSizePtr,
diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
index 950e844b..8c02ddcf 100644
--- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
+++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
@@ -190,14 +190,10 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 MilanInt BufferSize;
 MilanLongInt *Buffer;

- // Declare the locks
- omp_lock_t MateLock[NLVer];
-
 initialize(NLVer, NLEdge, StartIndex,
 EndIndex, &numGhostEdges,
 &numGhostVertices, &S,
 verLocInd, verLocPtr,
- MateLock,
 Ghost2LocalMap, Counter,
 verGhostPtr, verGhostInd,
 tempCounter, GMate,
@@ -307,8 +303,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 privateQLocalVtx,
 privateQGhostVtx,
 privateQMsgType,
- privateQOwner,
- MateLock);
+ privateQOwner);

 /////////////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
@@ -616,8 +611,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 msgInd,
 msgIndSent,
 NumMessagesBundled,
- msgPercent,
- MateLock);
+ msgPercent);

 finishTime = MPI_Wtime();
 *ph2_time = finishTime - startTime; // Time taken for Phase-2
diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp
index 
d91076c9..29fa351d 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -16,8 +16,7 @@ void clean(MilanLongInt NLVer, MilanLongInt msgInd, MilanLongInt *msgIndSent, MilanLongInt NumMessagesBundled, - MilanReal *msgPercent, - omp_lock_t *MateLock) + MilanReal *msgPercent) { // Cleanup Phase @@ -83,10 +82,6 @@ void clean(MilanLongInt NLVer, *msgPercent = 0; } } - // Destroy the locks -#pragma omp taskloop num_tasks(NUM_THREAD) - for (int i = 0; i < NLVer; i++) - omp_destroy_lock(&MateLock[i]); #ifdef DEBUG_HANG_ if (myRank == 0) diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 979cdcf5..8325e455 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -7,7 +7,6 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *S, MilanLongInt *verLocInd, MilanLongInt *verLocPtr, - omp_lock_t *MateLock, map &Ghost2LocalMap, vector &Counter, vector &verGhostPtr, @@ -40,10 +39,6 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp single { - // Initialize the locks -#pragma omp taskloop num_tasks(NUM_THREAD) - for (i = 0; i < NLVer; i++) - omp_init_lock(&MateLock[i]); #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 45b3918e..30efd79d 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -10,6 +10,8 @@ void PROCESS_CROSS_EDGE(vector &Counter, #pragma omp atomic capture captureCounter = --Counter[edge]; // Decrement + //assert(captureCounter >= 0); + if (captureCounter == 0) #pragma omp atomic (*SPtr)--; // Decrement S diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 87ea7a4f..11d7466d 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -31,8 +31,7 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner, - omp_lock_t *MateLock) + staticQueue &privateQOwner) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; From 4f07a70ed13826586538bc5c15bf12a1cbbd59f1 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 11:48:52 -0500 Subject: [PATCH 63/96] initialize refactoring --- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 2 +- amgprec/impl/aggregator/initialize.cpp | 139 +++++++++---------- 2 files changed, 67 insertions(+), 74 deletions(-) diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 1c41b439..109802de 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -5,7 +5,7 @@ ///Find the owner of a ghost node: MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs) { - //MilanLongInt Size = mVerDistance.size(); + MilanLongInt mStartInd = mVerDistance[myRank]; MilanInt Start = 0; MilanInt End = numProcs; diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 8325e455..3e7ac207 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -1,36 +1,35 @@ #include "MatchBoxPC.h" void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - 
MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *numGhostEdgesPtr, - MilanLongInt *numGhostVerticesPtr, - MilanLongInt *S, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - map &Ghost2LocalMap, - vector &Counter, - vector &verGhostPtr, - vector &verGhostInd, - vector &tempCounter, - vector &GMate, - vector &Message, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdges, + MilanLongInt *numGhostVertices, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) { - MilanLongInt insertMe = 0, numGhostEdges = 0, numGhostVertices = 0; + MilanLongInt insertMe = 0; MilanLongInt adj1, adj2; int i, v, k, w; - // index that starts with zero to |Vg| - 1 map::iterator storedAlready; @@ -55,19 +54,17 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * only when a ghost edge is found and ghost edges are a minority, * circa 3.5% during the tests. */ - #pragma omp task depend(out \ - : numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, numGhostVertices) + : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices) { - -#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ - : numGhostEdges) +#pragma omp taskloop num_tasks(NUM_THREAD) for (i = 0; i < NLEdge; i++) { // O(m) - Each edge stored twice insertMe = verLocInd[i]; if ((insertMe < StartIndex) || (insertMe > EndIndex)) { // Find a ghost - numGhostEdges++; +#pragma omp atomic + (*numGhostEdges)++; #pragma omp critical { storedAlready = Ghost2LocalMap.find(insertMe); @@ -76,24 +73,24 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, Counter[storedAlready->second]++; // Increment the counter } else - { // Insert an entry for the ghost: - Ghost2LocalMap[insertMe] = numGhostVertices; // Add a map entry - Counter.push_back(1); // Initialize the counter - numGhostVertices++; // Increment the number of ghost vertices - } // End of else() + { // Insert an entry for the ghost: + Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry + Counter.push_back(1); // Initialize the counter + (*numGhostVertices)++; // Increment the number of ghost vertices + } // End of else() } } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) } // End of for(ghost vertices) } // end of task depend - // numGhostEdges = atomicNumGhostEdges; + // *numGhostEdges = atomicNumGhostEdges; #ifdef TIME_TRACKER Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")NGhosts:" << numGhostVertices << " GhostEdges: " << numGhostEdges; + cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges; if (!Ghost2LocalMap.empty()) { cout << "\n(" << myRank << 
")Final Map : on process "; @@ -111,16 +108,16 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ - : numGhostVertices, numGhostEdges) + : *numGhostVertices, *numGhostEdges) { // Initialize adjacency Lists for Ghost Vertices: try { - verGhostPtr.reserve(numGhostVertices + 1); // Pointer Vector - tempCounter.reserve(numGhostVertices); // Pointer Vector - verGhostInd.reserve(numGhostEdges); // Index Vector - GMate.reserve(numGhostVertices); // Ghost Mate Vector + verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector + tempCounter.reserve(*numGhostVertices); // Pointer Vector + verGhostInd.reserve(*numGhostEdges); // Index Vector + GMate.reserve(*numGhostVertices); // Ghost Mate Vector } catch (length_error) { @@ -129,11 +126,11 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, exit(1); } // Initialize the Vectors: - verGhostPtr.resize(numGhostVertices + 1, 0); // Pointer Vector - tempCounter.resize(numGhostVertices, 0); // Temporary Counter - verGhostInd.resize(numGhostEdges, -1); // Index Vector - GMate.resize(numGhostVertices, -1); // Temporary Counter - verGhostPtr[0] = 0; // The first value + verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector + tempCounter.resize(*numGhostVertices, 0); // Temporary Counter + verGhostInd.resize(*numGhostEdges, -1); // Index Vector + GMate.resize(*numGhostVertices, -1); // Temporary Counter + verGhostPtr[0] = 0; // The first value #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Ghost Vertex Pointer: "; fflush(stdout); @@ -143,13 +140,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : verGhostPtr) depend(in \ - : Counter, numGhostVertices) + : Counter, *numGhostVertices) { #ifdef TIME_TRACKER double verGhostPtrInitialization = MPI_Wtime(); #endif - for (i = 0; i < numGhostVertices; i++) + for (i = 0; i < *numGhostVertices; i++) { // O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ @@ -165,8 +162,8 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } // End of task #ifdef PRINT_DEBUG_INFO_ - if (numGhostVertices > 0) - cout << verGhostPtr[numGhostVertices] << "\n"; + if (*numGhostVertices > 0) + cout << verGhostPtr[*numGhostVertices] << "\n"; fflush(stdout); #endif @@ -220,22 +217,22 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Ghost Vertex Index: "; - for (v = 0; v < numGhostEdges; v++) + for (v = 0; v < *numGhostEdges; v++) cout << verGhostInd[v] << "\t"; cout << endl; fflush(stdout); #endif -#pragma omp task depend(in \ - : numGhostEdges) depend(out \ - : QLocalVtx, QGhostVtx, QMsgType, QOwner) +#pragma omp task depend(in \ + : *numGhostEdges) depend(out \ + : QLocalVtx, QGhostVtx, QMsgType, QOwner) { try { - QLocalVtx.reserve(numGhostEdges); // Local Vertex - QGhostVtx.reserve(numGhostEdges); // Ghost Vertex - QMsgType.reserve(numGhostEdges); // Message Type (Request/Failure) - QOwner.reserve(numGhostEdges); // Owner of the ghost: COmpute once and use later + QLocalVtx.reserve(*numGhostEdges); // Local Vertex + QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex + QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure) + QOwner.reserve(*numGhostEdges); // Owner of the ghost: COmpute once and use later } catch (length_error) { @@ -268,24 +265,20 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, fflush(stdout); #endif -#pragma omp task 
depend(in \ - : numGhostEdges, numGhostVertices) depend(out \ - : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) +#pragma omp task depend(in \ + : *numGhostVertices) depend(out \ + : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) { - // The values calculated in this function are sent back to the calling function - *numGhostEdgesPtr = numGhostEdges; - *numGhostVerticesPtr = numGhostVertices; - // Allocate Data Structures: /* * candidateMate was a vector and has been replaced with an array * there is no point in using the vector (or maybe there is (???)) * so I replaced it with an array wich is slightly faster */ - candidateMate = new MilanLongInt[NLVer + numGhostVertices]; + candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)]; - *S = numGhostVertices; // Initialize S with number of Ghost Vertices + *S = (*numGhostVertices); // Initialize S with number of Ghost Vertices /* * Create the Queue Data Structure for the Dominating Set @@ -295,13 +288,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ - new (&U) staticQueue(NLVer + numGhostVertices); + new (&U) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I decide a more meaningfull size? - MilanLongInt size = numGhostVertices; + MilanLongInt size = (*numGhostVertices); // Initialize the privte data structure - new (&privateU) staticQueue(NLVer + numGhostVertices); // TODO how can I put a meaningfull size? + new (&privateU) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I put a meaningfull size? new (&privateQLocalVtx) staticQueue(size); new (&privateQGhostVtx) staticQueue(size); new (&privateQMsgType) staticQueue(size); From a71fe82752d79028d98d5585e8ebc108fc6cc58c Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 12:03:48 -0500 Subject: [PATCH 64/96] PROCESS_CROSS_EDGE refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 3 +-- ...EdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 2 +- amgprec/impl/aggregator/processCrossEdge.cpp | 5 ++--- amgprec/impl/aggregator/processExposedVertex.cpp | 2 +- amgprec/impl/aggregator/processMatchedVertices.cpp | 2 +- amgprec/impl/aggregator/processMessages.cpp | 8 ++++---- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 1ff2cb56..1e84b7ca 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -287,8 +287,7 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); - void PROCESS_CROSS_EDGE(vector &Counter, - MilanLongInt edge, + void PROCESS_CROSS_EDGE(MilanLongInt *edge, MilanLongInt *SPtr); void processMatchedVertices( diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 8c02ddcf..ab031f68 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -456,7 +456,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); } // End of if CandidateMate[w] = v } // End of 
if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 30efd79d..30af9f20 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,14 +1,13 @@ #include "MatchBoxPC.h" -void PROCESS_CROSS_EDGE(vector &Counter, - MilanLongInt edge, +void PROCESS_CROSS_EDGE(MilanLongInt *edge, MilanLongInt *SPtr) { // Start: PARALLEL_PROCESS_CROSS_EDGE_B MilanLongInt captureCounter; #pragma omp atomic capture - captureCounter = --Counter[edge]; // Decrement + captureCounter = --(*edge); // Decrement //assert(captureCounter >= 0); diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 97840b19..a2ea6c8d 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -115,7 +115,7 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, //TODO refactor this!! // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 11d7466d..fe983285 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -183,7 +183,7 @@ void processMatchedVertices( fflush(stdout); #endif // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); case 2: // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 7e5c3915..c812e96d 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -194,7 +194,7 @@ void processMessages( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -207,7 +207,7 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -260,7 +260,7 @@ void processMessages( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -319,7 +319,7 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); // Decrease the counter + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter } // End of else: CASE III } // End of else: CASE I } From 3e945c75b44ccf75a097deaabb5700286e052b2c Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 13:20:49 -0500 Subject: [PATCH 65/96] Refactoring, removed all useless Pointer passed in functions --- 
amgprec/impl/aggregator/processCrossEdge.cpp | 4 +- .../impl/aggregator/processExposedVertex.cpp | 96 +++++++++---------- amgprec/impl/aggregator/processMessages.cpp | 25 +++-- .../impl/aggregator/sendBundledMessages.cpp | 58 +++++------ 4 files changed, 85 insertions(+), 98 deletions(-) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 30af9f20..e844f127 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,7 +1,7 @@ #include "MatchBoxPC.h" void PROCESS_CROSS_EDGE(MilanLongInt *edge, - MilanLongInt *SPtr) + MilanLongInt *S) { // Start: PARALLEL_PROCESS_CROSS_EDGE_B MilanLongInt captureCounter; @@ -13,7 +13,7 @@ void PROCESS_CROSS_EDGE(MilanLongInt *edge, if (captureCounter == 0) #pragma omp atomic - (*SPtr)--; // Decrement S + (*S)--; // Decrement S #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index a2ea6c8d..c28a7c66 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -1,45 +1,43 @@ #include "MatchBoxPC.h" void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, - MilanLongInt *candidateMate, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - MilanLongInt *Mate, - vector &GMate, - map &Ghost2LocalMap, - MilanReal *edgeLocWeight, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, - MilanLongInt *SPtr, - MilanLongInt *verDistance, - MilanLongInt *PCounter, - vector &Counter, - MilanInt myRank, - MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *S, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) { - MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0, S = *SPtr; - MilanLongInt myCard = 0, msgInd = 0; - MilanLongInt NumMessagesBundled = 0; + MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; MilanInt ghostOwner = 0; #pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { #pragma omp for reduction(+ \ - : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) + : PCounter[:numProcs]) schedule(static) for (v = 0; v < NLVer; v++) { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) @@ -76,8 +74,8 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if 
(w >= 0) { - - myCard++; +#pragma omp atomic + (*myCard)++; if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ @@ -85,9 +83,10 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - - msgInd++; - NumMessagesBundled++; +#pragma omp atomic + (*msgInd)++; +#pragma omp atomic + (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); @@ -97,7 +96,6 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); privateQOwner.push_back(ghostOwner); - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { @@ -113,9 +111,9 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, fflush(stdout); #endif - //TODO refactor this!! - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); + // TODO refactor this!! + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex @@ -159,9 +157,10 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - - msgInd++; - NumMessagesBundled++; +#pragma omp atomic + (*msgInd)++; +#pragma omp atomic + (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); @@ -184,14 +183,5 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQMsgType, privateQOwner); -//TODO move this outside of the parallel region!! 
-#pragma omp master - { - *myCardPtr = myCard; - *msgIndPtr = msgInd; - *NumMessagesBundledPtr = NumMessagesBundled; - *SPtr = S; - } - } // End of parallel region } \ No newline at end of file diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c812e96d..43fc5ce9 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -9,9 +9,9 @@ void processMessages( vector &Counter, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *msgActualPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *msgActual, MilanReal *edgeLocWeight, MilanLongInt *verDistance, MilanLongInt *verLocPtr, @@ -30,7 +30,7 @@ void processMessages( MilanInt Sender; MPI_Status computeStatus; - MilanLongInt bundleSize, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; + MilanLongInt bundleSize, w; MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; int error_codeC; @@ -188,7 +188,7 @@ void processMessages( Mate[v - StartIndex] = u; // v is local U.push_back(v); U.push_back(u); - myCard++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); @@ -246,15 +246,15 @@ void processMessages( assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; + (*msgInd)++; + (*msgActual)++; if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { Mate[v - StartIndex] = w; // v is local GMate[Ghost2LocalMap[w]] = v; // w is ghost U.push_back(v); U.push_back(w); - myCard++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; fflush(stdout); @@ -272,7 +272,7 @@ void processMessages( // Q.push_back(u); U.push_back(v); U.push_back(w); - myCard++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; fflush(stdout); @@ -303,8 +303,8 @@ void processMessages( assert(ghostOwner != -1); assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; + (*msgInd)++; + (*msgActual)++; } // End of if(GHOST) } // End of for loop } // End of Else: w == -1 @@ -324,8 +324,5 @@ void processMessages( } // End of else: CASE I } - *myCardPtr = myCard; - *msgIndPtr = msgInd; - *msgActualPtr = msgActual; return; } \ No newline at end of file diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index f3dd3e46..8665759c 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -1,15 +1,15 @@ #include "MatchBoxPC.h" -void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, - MilanInt *BufferSizePtr, +void sendBundledMessages(MilanLongInt *numGhostEdges, + MilanInt *BufferSize, MilanLongInt *Buffer, vector &PCumulative, vector &PMessageBundle, vector &PSizeInfoMessages, MilanLongInt *PCounter, MilanLongInt NumMessagesBundled, - MilanLongInt *msgActualPtr, - MilanLongInt *MessageIndexPtr, + MilanLongInt *msgActual, + MilanLongInt *MessageIndex, MilanInt numProcs, MilanInt myRank, MPI_Comm comm, @@ -21,8 +21,8 @@ void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, vector &SStatus) { - MilanLongInt myIndex = 0, msgActual = *msgActualPtr, MessageIndex = *MessageIndexPtr, numGhostEdges = *numGhostEdgesPtr, numMessagesToSend; - 
MilanInt i = 0, OneMessageSize = 0, BufferSize = *BufferSizePtr; + MilanLongInt myIndex = 0, numMessagesToSend; + MilanInt i = 0, OneMessageSize = 0; #ifdef DEBUG_HANG_ if (myRank == 0) @@ -105,7 +105,7 @@ PSizeInfoMessages.resize(numProcs * 3, 0); // Send the Messages #pragma omp task depend(inout \ : SRequest, PSizeInfoMessages, PCumulative) depend(out \ - : msgActual, MessageIndex) + : *msgActual, *MessageIndex) { for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! @@ -124,9 +124,9 @@ PSizeInfoMessages.resize(numProcs * 3, 0); if (PSizeInfoMessages[i * 3 + 0] > 0) { // Send only if it is a nonempty packet MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[MessageIndex]); - msgActual++; - MessageIndex++; + &SRequest[(*MessageIndex)]); + (*msgActual)++; + (*MessageIndex)++; // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; @@ -136,8 +136,8 @@ PSizeInfoMessages.resize(numProcs * 3, 0); fflush(stdout); #endif MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); - MessageIndex++; + TypeMap(), i, BundleTag, comm, &SRequest[(*MessageIndex)]); + (*MessageIndex)++; } // End of if size > 0 } } @@ -154,16 +154,16 @@ PSizeInfoMessages.resize(numProcs * 3, 0); QOwner.clear(); } -#pragma omp task depend(inout : OneMessageSize, BufferSize) depend(out : numMessagesToSend) depend(in : numGhostEdges) +#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ")Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; - if (numGhostEdges > 0) + if (*numGhostEdges > 0) { - cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(numGhostEdges * 2)) * 100.0 << "% \n"; + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; } fflush(stdout); #endif @@ -177,39 +177,39 @@ PSizeInfoMessages.resize(numProcs * 3, 0); // Request, Success, Failure. // But only two will be sent from a given processor. 
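The next lines turn that worst case into the buffer that backs the later MPI_Bsend calls: one packed message plus MPI_BSEND_OVERHEAD for every message still expected, attached once with MPI_Buffer_attach. A hedged sketch of the same sizing-and-attach step, with illustrative names and assuming three MilanLongInt values per message:

#include <mpi.h>
#include <cstdlib>

long *attachBsendBuffer(int expectedMessages, MPI_Comm comm, int *bufferSize)
{
    int oneMessageSize = 0;
    MPI_Pack_size(3, MPI_LONG, comm, &oneMessageSize);   // room for one 3-value message
    *bufferSize = (oneMessageSize + MPI_BSEND_OVERHEAD) * expectedMessages;
    long *buffer = nullptr;
    if (*bufferSize > 0) {
        buffer = (long *)malloc(*bufferSize);
        MPI_Buffer_attach(buffer, *bufferSize);          // detach (and only then free) when done
    }
    return buffer;
}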
// Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; - BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; + numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; + *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; } -#pragma omp task depend(out : Buffer) depend(in : BufferSize) +#pragma omp task depend(out : Buffer) depend(in : *BufferSize) { Buffer = 0; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; - cout << "\n(" << myRank << ")BufferSize = " << BufferSize; + cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); cout << "\n(" << myRank << ")Attaching Buffer on.. "; fflush(stdout); #endif - if (BufferSize > 0) + if ((*BufferSize) > 0) { - Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory + Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory if (Buffer == 0) { cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; exit(1); } - MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer + MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer } } } } -*MessageIndexPtr = MessageIndex; -*msgActualPtr = msgActual; -*numGhostEdgesPtr = numGhostEdges; -*BufferSizePtr = BufferSize; +//*MessageIndexPtr = MessageIndex; +//*msgActualPtr = msgActual; +//*numGhostEdgesPtr = numGhostEdges; +//*BufferSizePtr = BufferSize; } \ No newline at end of file From 44f174a5714063b50cb08a689b237258aea2f75b Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 13:44:58 -0500 Subject: [PATCH 66/96] findOwnerOfGhost optimization and refactor --- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 57 +++++++------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 109802de..b9d60614 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -1,48 +1,29 @@ #include "MatchBoxPC.h" - -//TODO parallelize this -///Find the owner of a ghost node: +/// Find the owner of a ghost node: MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs) { + MilanInt myRank, MilanInt numProcs) +{ MilanLongInt mStartInd = mVerDistance[myRank]; MilanInt Start = 0; MilanInt End = numProcs; MilanInt Current = 0; -#if 0 - if ( vtxIndex < mStartInd ) - End = myRank; - else - Start = myRank; -#endif + while (Start <= End) + { + Current = (End + Start) / 2; + // CASE-1: + if (mVerDistance[Current] == vtxIndex) return Current; + else // CASE 2: + if (mVerDistance[Current] > vtxIndex) + End = Current - 1; + else // CASE 3: + Start = Current + 1; + } // End of While() + + if (mVerDistance[Current] > vtxIndex) + return (Current - 1); - while ( Start <= End ) { - Current = (End + Start)/2; - //CASE-1: - if ( mVerDistance[Current] == vtxIndex ) { - while ( mVerDistance[Current+1] == vtxIndex ) { - Current++; - if ( Current == numProcs ) - return (-1); - } 
- return (Current); - } - else { //CASE 2: - if ( mVerDistance[Current] > vtxIndex ) - End = Current - 1; - else //CASE 3: - Start = Current + 1; - } - } //End of While() - if ( Current == 0 ) - return (Current); - else { - if ( mVerDistance[Current] > vtxIndex ) - return (Current-1); - else - return (Current); - } //End of else - return (-1); //It should not reach here! -} //End of findOwnerOfGhost() + return Current; +} // End of findOwnerOfGhost() From 22d9baf29608a3f3b190f33ac735361528f56ae9 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 18 Jul 2022 14:11:53 -0500 Subject: [PATCH 67/96] isAlreadyMatched substituted with atomic read in one place --- amgprec/impl/aggregator/MatchBoxPC.h | 1 + ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 204 +++++++++--------- amgprec/impl/aggregator/isAlreadyMatched.cpp | 1 + .../aggregator/processMatchedVertices.cpp | 15 +- 4 files changed, 108 insertions(+), 113 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 1e84b7ca..e8a2e2cc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -292,6 +292,7 @@ extern "C" void processMatchedVertices( MilanLongInt NLVer, + vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ab031f68..a4fb68e5 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -274,8 +274,12 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// + // TODO what would be the optimal UCHUNK + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + processMatchedVertices(NLVer, - //UChunkBeingProcessed, + UChunkBeingProcessed, U, privateU, StartIndex, @@ -382,117 +386,47 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) { // v is a Local Vertex: - if (Mate[v - StartIndex] >= 0) // v is already matched - continue; + //if (Mate[v - StartIndex] >= 0) // v is already matched + // continue; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; fflush(stdout); #endif - if (candidateMate[v - StartIndex] == u) - { // Only if pointing to the matched vertex - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - w = -1; - heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN - for (k1 = adj11; k1 < adj12; k1++) - { - if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) - { // Is it a ghost vertex? 
- if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched - continue; - } - else - { // A local vertex - if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched - continue; - } - - if ((edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) - { - heaviestEdgeWt = edgeLocWeight[k1]; - w = verLocInd[k1]; - } - } // End of for loop - candidateMate[v - StartIndex] = w; - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - GMate[Ghost2LocalMap[w]] = v; // w is ghost - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; + // If the current vertex is pointing to a matched vertex and is not matched + if (Mate[v - StartIndex] < 0) + { + if (candidateMate[v - StartIndex] == u) + { // Only if pointing to the matched vertex + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - { // no dominating edge found: w == -1 - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) + // If found a dominating edge: + if (w >= 0) { - w = verLocInd[k1]; if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + { // w is a ghost // Build the Message Packet: Message[0] = v; // LOCAL Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE + Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Sending a request message:"; cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << 
findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif @@ -502,12 +436,72 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); msgInd++; msgActual++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u) - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + { // no dominating edge found: w == -1 + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + msgInd++; + msgActual++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u) + } + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor v is a ghost vertex if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index 38ae73f5..d4efd416 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -1,5 +1,6 @@ #include "MatchBoxPC.h" +//TODO can be optimized!! 
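The computeCandidateMate call introduced in the CMP hunk above stands in for the inline search that was removed there: it has to return the heaviest neighbour that is not yet matched, breaking weight ties towards the larger vertex index. A sketch of that selection with illustrative names, reading the matched status from Mate for local vertices and from GMate for ghosts:

#include <cfloat>
#include <map>
#include <vector>

long heaviestUnmatchedNeighbor(long adj1, long adj2, const double *edgeLocWeight,
                               const long *verLocInd, long StartIndex, long EndIndex,
                               const std::vector<long> &GMate, const long *Mate,
                               std::map<long, long> &Ghost2LocalMap)
{
    long w = -1;
    double heaviest = -DBL_MAX;
    for (long k = adj1; k < adj2; k++) {
        long cand = verLocInd[k];
        // ghost neighbours are looked up through Ghost2LocalMap/GMate, local ones in Mate
        bool matched = (cand < StartIndex || cand > EndIndex)
                           ? GMate[Ghost2LocalMap[cand]] >= 0
                           : Mate[cand - StartIndex] >= 0;
        if (matched)
            continue;
        if (edgeLocWeight[k] > heaviest ||
            (edgeLocWeight[k] == heaviest && cand > w)) {
            heaviest = edgeLocWeight[k];
            w = cand;
        }
    }
    return w;   // -1 when every neighbour is already matched
}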
/** * //TODO documentation * @param k diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index fe983285..0054ffa2 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -4,6 +4,7 @@ void processMatchedVertices( MilanLongInt NLVer, + vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, @@ -36,6 +37,8 @@ void processMatchedVertices( MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; int option; + MilanLongInt mateVal; + #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -45,14 +48,9 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) { - // TODO what would be the optimal UCHUNK - // TODO refactor - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); - while (!U.empty()) { @@ -86,9 +84,10 @@ void processMatchedVertices( cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; fflush(stdout); #endif - +#pragma omp atomic read + mateVal = Mate[v - StartIndex]; // If the current vertex is pointing to a matched vertex and is not matched - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + if (mateVal < 0) { #pragma omp critical { From cdf92ea2b247e1b35f5e6ea6413b7104076654e7 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 20 Jul 2022 15:37:29 -0500 Subject: [PATCH 68/96] processMatchedVerticess add send messages with error --- amgprec/impl/aggregator/MatchBoxPC.h | 49 ++++++---- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 60 ++++++++++-- .../aggregator/processMatchedVertices.cpp | 98 +++++++++++++++---- amgprec/impl/aggregator/processMessages.cpp | 4 +- .../impl/aggregator/sendBundledMessages.cpp | 17 ++-- amgprec/stZnqhkT | 1 + 6 files changed, 169 insertions(+), 60 deletions(-) create mode 100644 amgprec/stZnqhkT diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index e8a2e2cc..dafe381e 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -76,6 +76,8 @@ const MilanLongInt SIZEINFO = 4; const int ComputeTag = 7; // Predefined tag const int BundleTag = 9; // Predefined tag +static vector DEFAULT_VECTOR; + // MPI type map template MPI_Datatype TypeMap(); @@ -320,27 +322,32 @@ extern "C" staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, - MilanInt *BufferSizePtr, - MilanLongInt *Buffer, - vector &PCumulative, - vector &PMessageBundle, - vector &PSizeInfoMessages, - MilanLongInt *PCounter, - MilanLongInt NumMessagesBundled, - MilanLongInt *msgActualPtr, - MilanLongInt *MessageIndexPtr, - MilanInt numProcs, - MilanInt myRank, - MPI_Comm 
comm, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - vector &SRequest, - vector &SStatus); + staticQueue &privateQOwner, + bool sendMessages = false, + MPI_Comm comm = NULL, + MilanLongInt *msgActual = nullptr, + MilanLongInt *msgInd = nullptr, + vector &Message = DEFAULT_VECTOR); + + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); void processMessages( MilanLongInt NLVer, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index a4fb68e5..99fd57c3 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -368,9 +368,51 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - while (/*!Q.empty()*/ !U.empty()) + ///* + +//#define error +#ifdef error + processMatchedVertices(NLVer, + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner, + true, + comm, + &msgActual, + &msgInd, + Message); +#endif +#ifndef error + + while (!U.empty()) { - // Q.pop_front(); + u = U.pop_front(); // Get an element from the queue #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")u: " << u; @@ -385,9 +427,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( { v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) - { // v is a Local Vertex: - //if (Mate[v - StartIndex] >= 0) // v is already matched - // continue; + { // v is a Local Vertex: + // if (Mate[v - StartIndex] >= 0) // v is already matched + // continue; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; fflush(stdout); @@ -500,8 +542,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } // End of If (candidateMate[v-StartIndex] == u) - } - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + } // if (Mate[v - StartIndex] < 0) + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor v is a ghost vertex if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) @@ -536,7 +578,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } // End of Else //A Ghost Vertex } // End of For Loop adj(u) } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process 
Only If a Local Vertex - } // End of while ( /*!Q.empty()*/ !U.empty() ) + } // End of while ( !U.empty() ) +#endif + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// //// BREAK IF NO MESSAGES EXPECTED ///////// diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 0054ffa2..296bfa15 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -32,7 +32,12 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner) + staticQueue &privateQOwner, + bool sendMessages, + MPI_Comm comm, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + vector &Message) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; @@ -48,7 +53,7 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) { while (!U.empty()) @@ -144,16 +149,14 @@ void processMatchedVertices( #endif } // End of if(CandidateMate(w) = v } // End of Else - - } // End of if(w >=0) + } // End of if(w >=0) else option = 4; // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } - } // End of task - } // End of If (candidateMate[v-StartIndex] == u - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor is a ghost vertex @@ -184,16 +187,36 @@ void processMatchedVertices( // Decrement the counter: PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); case 2: + // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + //#pragma omp master + // { + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +// } #pragma omp atomic - PCounter[ghostOwner]++; + (*msgActual)++; + } + else + { #pragma omp atomic - (*msgIndPtr)++; + PCounter[ghostOwner]++; #pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundledPtr)++; + } + +#pragma omp atomic + (*msgIndPtr)++; + privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); @@ -224,12 +247,30 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + //#pragma omp master + // { + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, 
ComputeTag, comm); +// } +#pragma omp atomic + (*msgActual)++; + } + else + { #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundledPtr)++; + } + #pragma omp atomic - (*NumMessagesBundledPtr)++; + (*msgIndPtr)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -239,6 +280,7 @@ void processMatchedVertices( } // End of if(GHOST) } // End of for loop break; + case 5: default: #ifdef PRINT_DEBUG_INFO_ @@ -250,12 +292,32 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + + // Send a Request (Asynchronous) + //#pragma omp master + // { + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +// } #pragma omp atomic - PCounter[ghostOwner]++; + (*msgActual)++; + } + else + { #pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundledPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; + PCounter[ghostOwner]++; + } + +#pragma omp atomic + (*msgIndPtr)++; + privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); privateQMsgType.push_back(SUCCESS); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 43fc5ce9..bb21396b 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -104,7 +104,7 @@ void processMessages( ReceiveBuffer.resize(bundleSize, -1); // Initialize #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Message Bundle Before: " << endl; - for (i = 0; i < bundleSize; i++) + for (int i = 0; i < bundleSize; i++) cout << ReceiveBuffer[i] << ","; cout << endl; fflush(stdout); @@ -119,7 +119,7 @@ void processMessages( } #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Message Bundle After: " << endl; - for (i = 0; i < bundleSize; i++) + for (int i = 0; i < bundleSize; i++) cout << ReceiveBuffer[i] << ","; cout << endl; fflush(stdout); diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index 8665759c..f7fd2f78 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -9,7 +9,7 @@ void sendBundledMessages(MilanLongInt *numGhostEdges, MilanLongInt *PCounter, MilanLongInt NumMessagesBundled, MilanLongInt *msgActual, - MilanLongInt *MessageIndex, + MilanLongInt *msgInd, MilanInt numProcs, MilanInt myRank, MPI_Comm comm, @@ -105,7 +105,7 @@ PSizeInfoMessages.resize(numProcs * 3, 0); // Send the Messages #pragma omp task depend(inout \ : SRequest, PSizeInfoMessages, PCumulative) depend(out \ - : *msgActual, *MessageIndex) + : *msgActual, *msgInd) { for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! 
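The hunk below renames MessageIndex to msgInd and keeps it behind a pointer, but the send protocol is unchanged: for every destination with a non-empty packet, a three-value size header goes out first, then the data bundle, and each MPI_Isend records its request under the shared index. A minimal sketch of that two-step send, with illustrative names, MPI_LONG standing in for the TypeMap() datatype used in the patch, and requests already sized for two posts per destination:

#include <mpi.h>
#include <vector>

void sendBundle(long *header, long *payload, int count, int dest, MPI_Comm comm,
                std::vector<MPI_Request> &requests, long *msgInd)
{
    if (count <= 0)
        return;                               // nothing to send to this rank
    header[0] = count;                        // the payload length travels first
    MPI_Isend(header, 3, MPI_LONG, dest, /*ComputeTag*/ 7, comm, &requests[*msgInd]);
    (*msgInd)++;
    MPI_Isend(payload, count, MPI_LONG, dest, /*BundleTag*/ 9, comm, &requests[*msgInd]);
    (*msgInd)++;                              // header and payload must stay live until the waits complete
}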
@@ -124,9 +124,9 @@ PSizeInfoMessages.resize(numProcs * 3, 0); if (PSizeInfoMessages[i * 3 + 0] > 0) { // Send only if it is a nonempty packet MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[(*MessageIndex)]); + &SRequest[(*msgInd)]); (*msgActual)++; - (*MessageIndex)++; + (*msgInd)++; // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; @@ -136,8 +136,8 @@ PSizeInfoMessages.resize(numProcs * 3, 0); fflush(stdout); #endif MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[(*MessageIndex)]); - (*MessageIndex)++; + TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); + (*msgInd)++; } // End of if size > 0 } } @@ -207,9 +207,4 @@ PSizeInfoMessages.resize(numProcs * 3, 0); } } } - -//*MessageIndexPtr = MessageIndex; -//*msgActualPtr = msgActual; -//*numGhostEdgesPtr = numGhostEdges; -//*BufferSizePtr = BufferSize; } \ No newline at end of file diff --git a/amgprec/stZnqhkT b/amgprec/stZnqhkT new file mode 100644 index 00000000..8b277f0d --- /dev/null +++ b/amgprec/stZnqhkT @@ -0,0 +1 @@ +! From abf258e2e8a337870bdb7404975307861867fe69 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 20 Jul 2022 15:45:29 -0500 Subject: [PATCH 69/96] isAlreadyMatched is now atomic --- amgprec/impl/aggregator/clean.cpp | 2 -- amgprec/impl/aggregator/isAlreadyMatched.cpp | 29 ++++++++++++++------ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 29fa351d..62f366b2 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -1,8 +1,6 @@ #include "MatchBoxPC.h" // TODO comment -// TODO use task -// TODO destroy the locks void clean(MilanLongInt NLVer, MilanInt myRank, diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index d4efd416..a7d65c15 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -1,6 +1,5 @@ #include "MatchBoxPC.h" -//TODO can be optimized!! /** * //TODO documentation * @param k @@ -13,14 +12,14 @@ * @return */ bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap -) { + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) +{ - bool result = false; + /* #pragma omp critical(Mate) { if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? 
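The hunk below replaces the commented-out critical section with plain atomic reads of GMate and Mate. That read is only a cheap pre-filter; callers such as processMatchedVertices still re-check the candidate and perform the actual match inside a critical section, so a stale unmatched value just costs one extra lock acquisition. A sketch of that check-then-confirm shape, with illustrative names:

#include <omp.h>

static void tryMatch(long v, long u, long w, long *Mate, const long *candidateMate)
{
    long mateVal;
#pragma omp atomic read
    mateVal = Mate[v];                 // lock-free snapshot of the mate slot
    if (mateVal >= 0)
        return;                        // already matched, skip the lock entirely
#pragma omp critical
    {
        if (Mate[v] < 0 && candidateMate[v] == u)
            Mate[v] = w;               // decision confirmed while holding the lock
    }
}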
@@ -30,6 +29,18 @@ bool isAlreadyMatched(MilanLongInt node, } } + */ + MilanLongInt val; + if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex + { +#pragma omp atomic read + val = GMate[Ghost2LocalMap[node]]; + return val >= 0; // Already matched + } + + // If not ghost vertex +#pragma omp atomic read + val = Mate[node - StartIndex]; - return result; + return val >= 0; // Already matched } \ No newline at end of file From 9b065602a8573eaf0c0f8f105e5b44ccb4fcc203 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 20 Jul 2022 16:24:37 -0500 Subject: [PATCH 70/96] Fixed race condition in processExposedVertices --- .../impl/aggregator/processExposedVertex.cpp | 200 ++++++++++-------- 1 file changed, 108 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c28a7c66..c53f2f53 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -32,14 +32,14 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, { MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; - MilanInt ghostOwner = 0; + MilanInt ghostOwner = 0, option; -#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { -#pragma omp for reduction(+ \ - : PCounter[:numProcs]) schedule(static) +#pragma omp for reduction(+ : PCounter[:numProcs]) schedule(static) for (v = 0; v < NLVer; v++) { + option = -1; // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) k = candidateMate[v]; candidateMate[v] = verLocInd[k]; @@ -58,29 +58,75 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if (w >= 0) { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } - - if (w >= 0) +#pragma omp critical(processExposed) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) + { #pragma omp atomic - (*myCard)++; + (*myCard)++; + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost vertex + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) + { + option = 1; + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else + { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) + { + option = 3; + Mate[v] = w; // v is local + Mate[w - StartIndex] = v + StartIndex; // w is local + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); +#endif + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + + } // End of second if + + } // End critical processExposed + + } // End of 
if(w >=0) + else + { + // This piece of code is executed a really small amount of times + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost vertex + { // A ghost + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message (291):"; - cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif #pragma omp atomic @@ -94,86 +140,56 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQLocalVtx.push_back(v + StartIndex); privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); + privateQMsgType.push_back(FAILURE); privateQOwner.push_back(ghostOwner); - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) - { - - privateU.push_back(v + StartIndex); - privateU.push_back(w); - Mate[v] = w; - // FIXME could this instruction create errors? - GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; - fflush(stdout); -#endif - - // TODO refactor this!! - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); - } // End of if CandidateMate[w] = v - - } // End of if a Ghost Vertex - else - { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) - { - privateU.push_back(v + StartIndex); - privateU.push_back(w); + } // End of if(GHOST) + } // End of for loop + } + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - Mate[v] = w; // v is local - // FIXME this instruction could create errors - Mate[w - StartIndex] = v + StartIndex; // w is local + switch (option) + { + case -1: + break; + case 1: + privateU.push_back(v + StartIndex); + privateU.push_back(w); #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; + fflush(stdout); #endif - } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) - } // End of Else - - continue; - } // End of second if - - } // End of if(w >=0) - - // This piece of code is executed a really small amount of times, I will not allocate a - // huge amount of memory for the private data structures. 
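Taken together with the critical(processExposed) block above, the new structure decides what happened inside the lock, records it in the option code, and performs the queue pushes and counter updates afterwards in the switch that follows, so the serialized region stays as small as possible. A sketch of that shape, with illustrative names; privateU stands for a thread-private queue as in the patch:

#include <omp.h>
#include <vector>

static void processExposed(long v, long w, long *Mate, std::vector<long> &privateU)
{
    int option = -1;
#pragma omp critical(processExposed)
    {
        if (w >= 0 && Mate[v] < 0) {   // decision taken under the lock
            Mate[v] = w;
            option = 3;                // case 3: matched a local vertex
        }
    }
    switch (option) {                  // side effects outside the lock
    case 3:
        privateU.push_back(v);         // privateU is thread-private, no lock needed
        privateU.push_back(w);
        break;
    default:
        break;
    }
}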
- adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); + case 2: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a request message (291):"; + cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); #endif #pragma omp atomic - (*msgInd)++; + (*msgInd)++; #pragma omp atomic - (*NumMessagesBundled)++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + (*NumMessagesBundled)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + default: + privateU.push_back(v + StartIndex); + privateU.push_back(w); + break; + } + } // End of for ( v=0; v < NLVer; v++ ) queuesTransfer(U, privateU, QLocalVtx, From 9d1a416f9966ef461c4aaccbccbcf01bb5b90539 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 21 Jul 2022 15:45:31 -0500 Subject: [PATCH 71/96] add rm to exec.sh --- exec.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/exec.sh b/exec.sh index 50edf4ad..1181f776 100755 --- a/exec.sh +++ b/exec.sh @@ -1,4 +1,20 @@ -rm amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o +cd amgprec/impl/aggregator/ +rm MatchBoxPC.o +rm sendBundledMessages.o +rm initialize.o +rm extractUChunk.o +rm isAlreadyMatched.o +rm findOwnerOfGhost.o +rm computeCandidateMate.o +rm parallelComputeCandidateMateB.o +rm processMatchedVertices.o +rm processCrossEdge.o +rm queueTransfer.o +rm processMessages.o +rm processExposedVertex.o +rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o +rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o +cd ../../../ make all cd samples/advanced/pdegen make amg_d_pde3d From e328f3969c69b61d5110daf95ceb4f0f700fe82b Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Fri, 22 Jul 2022 07:25:09 -0500 Subject: [PATCH 72/96] queueTransfer optimization in processMatchedVertices --- amgprec/impl/aggregator/MatchBoxPC.h | 4 +- .../aggregator/processMatchedVertices.cpp | 58 ++++++------------- amgprec/impl/aggregator/processMessages.cpp | 7 +++ 3 files changed, 27 insertions(+), 42 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index dafe381e..cb7d95e2 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -65,8 +65,8 @@ using namespace std; -#define NUM_THREAD 4 -#define UCHUNK 5 +const int NUM_THREAD = 2; +const int UCHUNK = 50; const MilanLongInt REQUEST = 1; const 
MilanLongInt SUCCESS = 2; diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 296bfa15..5e233ce9 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -199,10 +199,11 @@ void processMatchedVertices( Message[1] = w; // GHOST Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) - //#pragma omp master - // { + + printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + fflush(stdout); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -// } + #pragma omp atomic (*msgActual)++; } @@ -254,10 +255,10 @@ void processMatchedVertices( Message[1] = w; // GHOST Message[2] = FAILURE; // TYPE // Send a Request (Asynchronous) - //#pragma omp master - // { + + printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + fflush(stdout); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -// } #pragma omp atomic (*msgActual)++; } @@ -300,10 +301,9 @@ void processMatchedVertices( Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - //#pragma omp master - // { + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + fflush(stdout); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -// } #pragma omp atomic (*msgActual)++; } @@ -327,39 +327,17 @@ void processMatchedVertices( } // End of switch } // End of inner for - - // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? - if (privateU.size() > UCHUNK || U.empty()) - { -#pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } - -#ifndef error -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } - } - -#endif - } // End of private.size() } } // End of outer for - } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + } // End of while ( !U.empty() ) #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index bb21396b..474453e3 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -28,6 +28,8 @@ void processMessages( staticQueue &U) { +//#define PRINT_DEBUG_INFO_ + MilanInt Sender; MPI_Status computeStatus; MilanLongInt bundleSize, w; @@ -92,6 +94,8 @@ void processMessages( if (Message[2] == SIZEINFO) { + //printf("Inizio sizeinfo\n"); + fflush(stdout); #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; @@ -124,6 +128,9 @@ void processMessages( cout << endl; fflush(stdout); #endif + + //printf("Fine sizeinfo\n"); + fflush(stdout); } else { // Just a single message: From aa45e2fe936db3d150aedb01409dfe933984cb5e Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 05:14:26 -0500 Subject: [PATCH 73/96] 
processMatchedVerticesAndSendMessages.cpp unoptimized --- amgprec/impl/aggregator/Makefile | 2 +- amgprec/impl/aggregator/MatchBoxPC.h | 46 ++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 178 +------- .../aggregator/processMatchedVertices.cpp | 78 +--- .../processMatchedVerticesAndSendMessages.cpp | 380 ++++++++++++++++++ amgprec/impl/aggregator/processMessages.cpp | 16 +- 6 files changed, 435 insertions(+), 265 deletions(-) create mode 100644 amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index f1760822..b3b1ac94 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -70,6 +70,7 @@ findOwnerOfGhost.o \ computeCandidateMate.o \ parallelComputeCandidateMateB.o \ processMatchedVertices.o \ +processMatchedVerticesAndSendMessages.o \ processCrossEdge.o \ queueTransfer.o \ processMessages.o \ @@ -77,7 +78,6 @@ processExposedVertex.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o - OBJS = $(FOBJS) $(MPCOBJS) LIBNAME=libamg_prec.a diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index cb7d95e2..8bba9540 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -65,8 +65,8 @@ using namespace std; -const int NUM_THREAD = 2; -const int UCHUNK = 50; +const int NUM_THREAD = 4; +const int UCHUNK = 10; const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; @@ -293,6 +293,38 @@ extern "C" MilanLongInt *SPtr); void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, vector &UChunkBeingProcessed, staticQueue &U, @@ -323,11 +355,11 @@ extern "C" staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, staticQueue &privateQOwner, - bool sendMessages = false, - MPI_Comm comm = NULL, - MilanLongInt *msgActual = nullptr, - MilanLongInt *msgInd = nullptr, - vector &Message = DEFAULT_VECTOR); + bool sendMessages, + MPI_Comm comm, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + vector &Message); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanInt *BufferSizePtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 99fd57c3..d8e8bfb7 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -368,11 +368,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( 
/////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - ///* -//#define error -#ifdef error - processMatchedVertices(NLVer, + processMatchedVerticesAndSendMessages(NLVer, UChunkBeingProcessed, U, privateU, @@ -407,179 +404,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &msgActual, &msgInd, Message); -#endif -#ifndef error - - while (!U.empty()) - { - - u = U.pop_front(); // Get an element from the queue -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); -#endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only If a Local Vertex - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) - { // v is a Local Vertex: - // if (Mate[v - StartIndex] >= 0) // v is already matched - // continue; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); -#endif - // If the current vertex is pointing to a matched vertex and is not matched - if (Mate[v - StartIndex] < 0) - { - if (candidateMate[v - StartIndex] == u) - { // Only if pointing to the matched vertex - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - GMate[Ghost2LocalMap[w]] = v; // w is ghost - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - } // End of if(CandidateMate(w) = v - } 
// End of Else - } // End of if(w >=0) - else - { // no dominating edge found: w == -1 - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u) - } // if (Mate[v - StartIndex] < 0) - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor v is a ghost vertex - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is a local vertex - // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; -#ifdef DEBUG_GHOST_ - if ((u < StartIndex) || (u > EndIndex)) - { - cout << "\n(" << myRank << ") " << __LINE__ << " From Send: should not happen: u= " << u << " v= " << v << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; - fflush(stdout); - } -#endif - - } // End of If( v != Mate[u] ) - } // End of Else //A Ghost Vertex - } // End of For Loop adj(u) - } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - } // End of while ( !U.empty() ) -#endif ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 5e233ce9..edb1f788 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -1,7 +1,5 @@ #include "MatchBoxPC.h" -//#define error - void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, @@ -32,12 +30,7 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner, - bool sendMessages, - MPI_Comm comm, - MilanLongInt *msgActual, - MilanLongInt *msgInd, - vector &Message) + staticQueue &privateQOwner) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; @@ -53,7 +46,7 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, 
ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) { while (!U.empty()) @@ -192,29 +185,10 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - - printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - fflush(stdout); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - #pragma omp atomic - (*msgActual)++; - } - else - { + PCounter[ghostOwner]++; #pragma omp atomic - PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; - } - + (*NumMessagesBundledPtr)++; #pragma omp atomic (*msgIndPtr)++; @@ -248,28 +222,10 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - - printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - fflush(stdout); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic - (*msgActual)++; - } - else - { #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - } - + (*NumMessagesBundledPtr)++; #pragma omp atomic (*msgIndPtr)++; @@ -293,27 +249,11 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - - // Send a Request (Asynchronous) - // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - fflush(stdout); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic - (*msgActual)++; - } - else - { + #pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundledPtr)++; #pragma omp atomic - PCounter[ghostOwner]++; - } + PCounter[ghostOwner]++; #pragma omp atomic (*msgIndPtr)++; diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp new file mode 100644 index 00000000..e61d561f --- /dev/null +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -0,0 +1,380 @@ +#include "MatchBoxPC.h" + +void processMatchedVerticesAndSendMessages( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt 
myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner, + bool sendMessages, + MPI_Comm comm, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + vector &Message) +{ + + MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + int option; + MilanLongInt mateVal; + + vector messagesToSend; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef COUNT_LOCAL_VERTEX + MilanLongInt localVertices = 0; +#endif +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) + { + + while (!U.empty()) + { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); +#endif + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices + +#ifdef COUNT_LOCAL_VERTEX + localVertices++; +#endif + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); +#endif +#pragma omp atomic read + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) + { +#pragma omp critical + { + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); +#endif + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + 
option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + + //printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + //fflush(stdout); +#pragma omp critical(sendMessage) + { + messagesToSend.push_back(v); + messagesToSend.push_back(w); + messagesToSend.push_back(REQUEST); + messagesToSend.push_back(ghostOwner); + } + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + +#pragma omp atomic + (*msgActual)++; + } + else + { +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + (*NumMessagesBundledPtr)++; + } + +#pragma omp atomic + (*msgIndPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + + //printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + //fflush(stdout); +#pragma omp critical(sendMessage) + { + messagesToSend.push_back(v); + messagesToSend.push_back(w); + messagesToSend.push_back(FAILURE); + messagesToSend.push_back(ghostOwner); + } + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +#pragma omp atomic + (*msgActual)++; + } + else + { +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + (*NumMessagesBundledPtr)++; + } + +#pragma omp atomic + (*msgIndPtr)++; + + privateQLocalVtx.push_back(v); + 
privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + + // Send a Request (Asynchronous) + //printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + //fflush(stdout); +#pragma omp critical(sendMessage) + { + messagesToSend.push_back(u); + messagesToSend.push_back(v); + messagesToSend.push_back(SUCCESS); + messagesToSend.push_back(ghostOwner); + } + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +#pragma omp atomic + (*msgActual)++; + } + else + { +#pragma omp atomic + (*NumMessagesBundledPtr)++; +#pragma omp atomic + PCounter[ghostOwner]++; + } + +#pragma omp atomic + (*msgIndPtr)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + } // End of while ( !U.empty() ) + +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); + +#endif + } // End of parallel region + + for (int i = 0; i < messagesToSend.size(); i += 4) + { + Message[0] = messagesToSend[i]; + Message[1] = messagesToSend[i + 1]; + Message[2] = messagesToSend[i + 2]; + ghostOwner = messagesToSend[i + 3]; + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + } +} diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 474453e3..c6cb2531 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -28,7 +28,7 @@ void processMessages( staticQueue &U) { -//#define PRINT_DEBUG_INFO_ + //#define PRINT_DEBUG_INFO_ MilanInt Sender; MPI_Status computeStatus; @@ -94,8 +94,6 @@ void processMessages( if (Message[2] == SIZEINFO) { - //printf("Inizio sizeinfo\n"); - fflush(stdout); #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; @@ -128,9 +126,6 @@ void processMessages( cout << endl; fflush(stdout); #endif - - //printf("Fine sizeinfo\n"); - fflush(stdout); } else { // Just a single message: @@ -162,8 +157,7 @@ void processMessages( fflush(stdout); #endif - - //Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop + // Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { u = ReceiveBuffer[bundleCounter - 3]; // GHOST @@ -325,10 +319,10 @@ void processMessages( cout << "\n(" << myRank << ")Message type is FAILURE" << endl; fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a 
Dummy Mate to make sure that we do not (u is a ghost) process this anymore + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter - } // End of else: CASE III - } // End of else: CASE I + } // End of else: CASE III + } // End of else: CASE I } return; From 5efee2004175bad45761608e74dd05d55bc2f5ad Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 05:52:27 -0500 Subject: [PATCH 74/96] Optimization, replaced all useless atomic with reduction --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 - amgprec/impl/aggregator/initialize.cpp | 6 +- .../impl/aggregator/processExposedVertex.cpp | 21 +++-- .../aggregator/processMatchedVertices.cpp | 58 +++++++------- .../processMatchedVerticesAndSendMessages.cpp | 78 +++++++++---------- amgprec/impl/aggregator/processMessages.cpp | 8 +- 7 files changed, 81 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 8bba9540..d4b8c04c 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -358,7 +358,6 @@ extern "C" bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, - MilanLongInt *msgInd, vector &Message); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d8e8bfb7..7b47c7c9 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -402,7 +402,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( true, comm, &msgActual, - &msgInd, Message); ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 3e7ac207..477f5f6d 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -57,13 +57,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices) { -#pragma omp taskloop num_tasks(NUM_THREAD) +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges[:1]) for (i = 0; i < NLEdge; i++) { // O(m) - Each edge stored twice insertMe = verLocInd[i]; if ((insertMe < StartIndex) || (insertMe > EndIndex)) { // Find a ghost -#pragma omp atomic (*numGhostEdges)++; #pragma omp critical { @@ -76,7 +76,7 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, { // Insert an entry for the ghost: Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry Counter.push_back(1); // Initialize the counter - (*numGhostVertices)++; // Increment the number of ghost vertices + (*numGhostVertices)++; // Increment the number of ghost vertices } // End of else() } } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c53f2f53..91035372 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -34,9 +34,13 @@ void 
PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; MilanInt ghostOwner = 0, option; -#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \ + num_threads(NUM_THREAD) + { -#pragma omp for reduction(+ : PCounter[:numProcs]) schedule(static) +#pragma omp for reduction(+ \ + : PCounter[:numProcs], myCard[:1], msgInd[:1], NumMessagesBundled[:1]) schedule(static) for (v = 0; v < NLVer; v++) { option = -1; @@ -76,7 +80,6 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if (w >= 0) { -#pragma omp atomic (*myCard)++; if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex @@ -129,13 +132,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif -#pragma omp atomic (*msgInd)++; -#pragma omp atomic (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; privateQLocalVtx.push_back(v + StartIndex); @@ -169,13 +170,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif -#pragma omp atomic (*msgInd)++; -#pragma omp atomic (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; privateQLocalVtx.push_back(v + StartIndex); diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index edb1f788..1e7b2641 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -7,9 +7,9 @@ void processMatchedVertices( staticQueue &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, MilanLongInt *SPtr, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, @@ -46,7 +46,14 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + 
[:1], NumMessagesBundled \ + [:1]) { while (!U.empty()) @@ -171,8 +178,8 @@ void processMatchedVertices( // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -183,14 +190,11 @@ void processMatchedVertices( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundled)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -200,8 +204,8 @@ void processMatchedVertices( case 3: privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + + (*myCard)++; break; case 4: // Could not find a dominating vertex @@ -220,14 +224,12 @@ void processMatchedVertices( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundled)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -247,16 +249,12 @@ void processMatchedVertices( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic + (*NumMessagesBundled)++; PCounter[ghostOwner]++; - -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index e61d561f..7775d193 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -7,9 +7,9 @@ void processMatchedVerticesAndSendMessages( staticQueue &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, MilanLongInt *SPtr, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, @@ -34,7 +34,6 @@ void processMatchedVerticesAndSendMessages( bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, - MilanLongInt *msgInd, vector &Message) { @@ -53,7 +52,16 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, 
privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \ +default(shared) \ +num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ + [:1]) { while (!U.empty()) @@ -178,8 +186,7 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -190,8 +197,8 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -200,8 +207,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) - //printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(v); @@ -211,19 +218,15 @@ void processMatchedVerticesAndSendMessages( } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundled)++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -233,8 +236,7 @@ void processMatchedVerticesAndSendMessages( case 3: privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + (*myCard)++; break; case 4: // Could not find a dominating vertex @@ -253,8 +255,8 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -263,8 +265,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = FAILURE; // TYPE // Send a Request (Asynchronous) - //printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(v); @@ -273,19 +275,15 @@ void processMatchedVerticesAndSendMessages( messagesToSend.push_back(ghostOwner); } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundled)++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -305,8 +303,8 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -315,8 +313,8 @@ void 
processMatchedVerticesAndSendMessages( Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - //printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(u); @@ -325,19 +323,15 @@ void processMatchedVerticesAndSendMessages( messagesToSend.push_back(ghostOwner); } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic + (*NumMessagesBundled)++; PCounter[ghostOwner]++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); @@ -371,10 +365,10 @@ void processMatchedVerticesAndSendMessages( for (int i = 0; i < messagesToSend.size(); i += 4) { - Message[0] = messagesToSend[i]; - Message[1] = messagesToSend[i + 1]; + Message[0] = messagesToSend[i]; + Message[1] = messagesToSend[i + 1]; Message[2] = messagesToSend[i + 2]; - ghostOwner = messagesToSend[i + 3]; + ghostOwner = messagesToSend[i + 3]; MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); } } diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c6cb2531..4150a330 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -243,8 +243,8 @@ void processMessages( fflush(stdout); #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgInd)++; @@ -301,8 +301,8 @@ void processMessages( fflush(stdout); #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgInd)++; (*msgActual)++; From 1ab166b38b975f2b561fb4e592ec627c58effbbc Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 08:24:50 -0500 Subject: [PATCH 75/96] Improved performance of processMatchedVerticesAndSendMessages.cpp --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 - .../processMatchedVerticesAndSendMessages.cpp | 153 ++++++++---------- 3 files changed, 63 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index d4b8c04c..8fcc495b 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -355,7 +355,6 @@ extern "C" staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, staticQueue &privateQOwner, - bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, vector &Message); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 7b47c7c9..612ac95f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -399,7 +399,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( 
privateQGhostVtx, privateQMsgType, privateQOwner, - true, comm, &msgActual, Message); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 7775d193..49235870 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -31,7 +31,6 @@ void processMatchedVerticesAndSendMessages( staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, staticQueue &privateQOwner, - bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, vector &Message) @@ -41,7 +40,7 @@ void processMatchedVerticesAndSendMessages( int option; MilanLongInt mateVal; - vector messagesToSend; + vector privatemessagesToSend, messagesToSend; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -52,16 +51,15 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ - firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \ -default(shared) \ -num_threads(NUM_THREAD) \ - reduction(+ \ - : msgInd[:1], PCounter \ - [:numProcs], myCard \ - [:1], NumMessagesBundled \ - [:1], msgActual \ - [:1]) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option, privatemessagesToSend) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ + [:1]) { while (!U.empty()) @@ -199,33 +197,22 @@ num_threads(NUM_THREAD) ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); // assert(ghostOwner != -1); // assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); -#pragma omp critical(sendMessage) - { - messagesToSend.push_back(v); - messagesToSend.push_back(w); - messagesToSend.push_back(REQUEST); - messagesToSend.push_back(ghostOwner); - } - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - } - else - { - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - } + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + + // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); + privatemessagesToSend.push_back(v); + privatemessagesToSend.push_back(w); + privatemessagesToSend.push_back(REQUEST); + privatemessagesToSend.push_back(ghostOwner); + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; (*msgInd)++; privateQLocalVtx.push_back(v); @@ -257,32 +244,22 @@ num_threads(NUM_THREAD) ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); // assert(ghostOwner != -1); // assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - 
Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); -#pragma omp critical(sendMessage) - { - messagesToSend.push_back(v); - messagesToSend.push_back(w); - messagesToSend.push_back(FAILURE); - messagesToSend.push_back(ghostOwner); - } - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgActual)++; - } - else - { - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - } + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + + // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); + privatemessagesToSend.push_back(v); + privatemessagesToSend.push_back(w); + privatemessagesToSend.push_back(FAILURE); + privatemessagesToSend.push_back(ghostOwner); + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; (*msgInd)++; privateQLocalVtx.push_back(v); @@ -305,32 +282,22 @@ num_threads(NUM_THREAD) ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); // assert(ghostOwner != -1); // assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - - // Send a Request (Asynchronous) - // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); -#pragma omp critical(sendMessage) - { - messagesToSend.push_back(u); - messagesToSend.push_back(v); - messagesToSend.push_back(SUCCESS); - messagesToSend.push_back(ghostOwner); - } - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgActual)++; - } - else - { - (*NumMessagesBundled)++; - PCounter[ghostOwner]++; - } + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + + // Send a Request (Asynchronous) + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout);) + privatemessagesToSend.push_back(u); + privatemessagesToSend.push_back(v); + privatemessagesToSend.push_back(SUCCESS); + privatemessagesToSend.push_back(ghostOwner); + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; (*msgInd)++; privateQLocalVtx.push_back(u); @@ -340,11 +307,17 @@ num_threads(NUM_THREAD) break; } // End of switch - - } // End of inner for + } // End of inner for } } // End of outer for +#pragma omp critical(sendMessageTransfer) + { + messagesToSend.insert(messagesToSend.end(), privatemessagesToSend.begin(), privatemessagesToSend.end()); + + privatemessagesToSend.clear(); + } + queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, From 066c1a5e62de13c322d1101834fa4a492e7af72b Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 09:27:35 -0500 Subject: [PATCH 76/96] optimization processMatchedVerticesAndSendMessages.cpp --- amgprec/impl/aggregator/MatchBoxPC.h | 4 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 4 - .../processMatchedVerticesAndSendMessages.cpp | 100 ++++++++---------- 3 files changed, 42 insertions(+), 66 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 8fcc495b..01cc0589 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -351,10 +351,6 @@ 
extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 612ac95f..c1210ea7 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -395,10 +395,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx, QMsgType, QOwner, - privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner, comm, &msgActual, Message); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 49235870..9d4077a7 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -27,20 +27,22 @@ void processMatchedVerticesAndSendMessages( vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message) { + MilanLongInt initialSize = QLocalVtx.size(); MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; int option; MilanLongInt mateVal; - vector privatemessagesToSend, messagesToSend; + // TODO reserve!!! + vector privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; + privateQLocalVtx.reserve(100000); + privateQGhostVtx.reserve(100000); + privateQMsgType.reserve(100000); + privateQOwner.reserve(100000); #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -51,7 +53,7 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option, privatemessagesToSend) \ +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ num_threads(NUM_THREAD) \ reduction(+ \ @@ -195,21 +197,12 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); - privatemessagesToSend.push_back(v); - privatemessagesToSend.push_back(w); - privatemessagesToSend.push_back(REQUEST); - privatemessagesToSend.push_back(ghostOwner); + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgActual)++; @@ -242,21 +235,12 @@ void 
processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); - privatemessagesToSend.push_back(v); - privatemessagesToSend.push_back(w); - privatemessagesToSend.push_back(FAILURE); - privatemessagesToSend.push_back(ghostOwner); + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgActual)++; @@ -280,21 +264,12 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - + // Message[0] = u; // LOCAL + // Message[1] = v; // GHOST + // Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout);) - privatemessagesToSend.push_back(u); - privatemessagesToSend.push_back(v); - privatemessagesToSend.push_back(SUCCESS); - privatemessagesToSend.push_back(ghostOwner); // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgActual)++; @@ -311,19 +286,25 @@ void processMatchedVerticesAndSendMessages( } } // End of outer for +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } + #pragma omp critical(sendMessageTransfer) { - messagesToSend.insert(messagesToSend.end(), privatemessagesToSend.begin(), privatemessagesToSend.end()); - privatemessagesToSend.clear(); - } + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + } } // End of while ( !U.empty() ) @@ -336,12 +317,15 @@ void processMatchedVerticesAndSendMessages( #endif } // End of parallel region - for (int i = 0; i < messagesToSend.size(); i += 4) + //Send the messages + for (int i = initialSize; i < QOwner.size(); i++) { - Message[0] = messagesToSend[i]; - Message[1] = messagesToSend[i + 1]; - Message[2] = messagesToSend[i + 2]; - ghostOwner = messagesToSend[i + 3]; + + Message[0] = QLocalVtx[i]; + Message[1] = QGhostVtx[i]; + Message[2] = QMsgType[i]; + ghostOwner = QOwner[i]; + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); } } From 500403dbdac33e4a9af4a6a125bbbbc561d79e49 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 11:13:21 -0500 Subject: [PATCH 77/96] Replaced some staticQueues with vectors for performance reasons --- amgprec/impl/aggregator/MatchBoxPC.h | 36 +++++----- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 65 ++++++++++--------- 
amgprec/impl/aggregator/initialize.cpp | 17 ++--- .../impl/aggregator/processExposedVertex.cpp | 15 +++-- .../aggregator/processMatchedVertices.cpp | 8 +-- .../processMatchedVerticesAndSendMessages.cpp | 40 ++++-------- amgprec/impl/aggregator/queueTransfer.cpp | 38 ++++++----- 7 files changed, 110 insertions(+), 109 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 01cc0589..1066f8ef 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -189,10 +189,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); bool isAlreadyMatched(MilanLongInt node, MilanLongInt StartIndex, @@ -233,10 +233,10 @@ extern "C" MilanLongInt *&candidateMate, staticQueue &U, staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); void clean(MilanLongInt NLVer, MilanInt myRank, @@ -284,10 +284,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); void PROCESS_CROSS_EDGE(MilanLongInt *edge, MilanLongInt *SPtr); @@ -319,10 +319,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, @@ -351,6 +351,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index c1210ea7..4297391a 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -182,7 +182,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector GMate; // Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; - staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; + staticQueue U, privateU; vector PCumulative, PMessageBundle, PSizeInfoMessages; vector SRequest; // Requests that are used for each send message vector SStatus; // Status of sent messages, used in MPI_Wait @@ -190,6 +190,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanInt BufferSize; MilanLongInt *Buffer; + vector privateQLocalVtx, privateQGhostVtx, privateQMsgType; + vector privateQOwner; + initialize(NLVer, NLEdge, StartIndex, EndIndex, &numGhostEdges, &numGhostVertices, &S, 
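// --- Illustrative sketch, not part of this patch ---------------------------------
// processMatchedVerticesAndSendMessages defers all MPI traffic generated inside the
// OpenMP region: threads only append (local vertex, ghost vertex, message type,
// owner) tuples to the shared queues, and the calling rank drains whatever was
// appended after entry with MPI_Bsend once the region has ended. The helper below
// shows that drain step in isolation; the function name, the plain long/int element
// types, MPI_LONG, and the tag parameter are simplifications, not code from this
// repository.
#include <mpi.h>
#include <vector>

static void flushQueuedMessages(const std::vector<long> &QLocalVtx,
                                const std::vector<long> &QGhostVtx,
                                const std::vector<long> &QMsgType,
                                const std::vector<int> &QOwner,
                                std::size_t initialSize, // queue size on entry
                                int computeTag, MPI_Comm comm)
{
    long Message[3];
    for (std::size_t i = initialSize; i < QOwner.size(); i++) {
        Message[0] = QLocalVtx[i]; // LOCAL
        Message[1] = QGhostVtx[i]; // GHOST
        Message[2] = QMsgType[i];  // REQUEST / SUCCESS / FAILURE
        // Buffered send, one small packet per queued message, as in the patch.
        MPI_Bsend(&Message[0], 3, MPI_LONG, QOwner[i], computeTag, comm);
    }
}
// ----------------------------------------------------------------------------------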
@@ -370,34 +373,38 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// processMatchedVerticesAndSendMessages(NLVer, - UChunkBeingProcessed, - U, - privateU, - StartIndex, - EndIndex, - &myCard, - &msgInd, - &NumMessagesBundled, - &S, - verLocPtr, - verLocInd, - verDistance, - PCounter, - Counter, - myRank, - numProcs, - candidateMate, - GMate, - Mate, - Ghost2LocalMap, - edgeLocWeight, - QLocalVtx, - QGhostVtx, - QMsgType, - QOwner, - comm, - &msgActual, - Message); + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner, + comm, + &msgActual, + Message); ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 477f5f6d..47f424fd 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -21,10 +21,10 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *&candidateMate, staticQueue &U, staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { MilanLongInt insertMe = 0; @@ -295,10 +295,11 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, // Initialize the privte data structure new (&privateU) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I put a meaningfull size? 
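// --- Illustrative sketch, not part of this patch ---------------------------------
// The staticQueue placement-new calls below are being replaced by std::vector with
// an up-front reserve(): capacity is claimed once (here *numGhostVertices serves as
// the estimate), so the push_back calls issued while the matching runs do not have
// to reallocate. The helper name and element type are illustrative only.
#include <vector>

static void preallocatePrivateQueue(std::vector<long> &queue, std::size_t expectedEntries)
{
    queue.reserve(expectedEntries); // single allocation; size() remains 0
    // Subsequent queue.push_back(x) calls stay cheap until size() exceeds
    // expectedEntries, at which point the vector grows as usual.
}
// ----------------------------------------------------------------------------------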
- new (&privateQLocalVtx) staticQueue(size); - new (&privateQGhostVtx) staticQueue(size); - new (&privateQMsgType) staticQueue(size); - new (&privateQOwner) staticQueue(size); + + privateQLocalVtx.reserve(*numGhostVertices); + privateQGhostVtx.reserve(*numGhostVertices); + privateQMsgType.reserve(*numGhostVertices); + privateQOwner.reserve(*numGhostVertices); } // end of task } // End of single region diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 91035372..c330e724 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -25,10 +25,10 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; @@ -39,8 +39,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, num_threads(NUM_THREAD) { -#pragma omp for reduction(+ \ - : PCounter[:numProcs], myCard[:1], msgInd[:1], NumMessagesBundled[:1]) schedule(static) +#pragma omp for reduction(+ \ + : PCounter[:numProcs], myCard \ + [:1], msgInd \ + [:1], NumMessagesBundled \ + [:1]) schedule(static) for (v = 0; v < NLVer; v++) { option = -1; diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 1e7b2641..510c9877 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -27,10 +27,10 @@ void processMatchedVertices( vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 9d4077a7..debfc5ca 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -27,6 +27,10 @@ void processMatchedVerticesAndSendMessages( vector &QGhostVtx, vector &QMsgType, vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message) @@ -37,13 +41,6 @@ void processMatchedVerticesAndSendMessages( int option; MilanLongInt mateVal; - // TODO reserve!!! 
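// --- Illustrative sketch, not part of this patch ---------------------------------
// The reduction clauses used above rely on OpenMP array sections: listing a pointer
// as ptr[:1] gives every thread a private copy of the pointee, initialized to zero
// and summed back into the original after the region, which is why the explicit
// "#pragma omp atomic" increments could be dropped. A minimal standalone example of
// the same idiom (names and types are hypothetical, compile with -fopenmp):
static void countEvens(const long *values, long n, long *evens)
{
#pragma omp parallel for reduction(+ : evens[:1])
    for (long i = 0; i < n; i++)
        if (values[i] % 2 == 0)
            (*evens)++; // no atomic needed: each thread increments its private copy
}
// Counters that are genuine arrays, such as PCounter, are reduced the same way with
// reduction(+ : PCounter[:numProcs]), i.e. one private copy of the whole array per
// thread.
// ----------------------------------------------------------------------------------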
- vector privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - privateQLocalVtx.reserve(100000); - privateQGhostVtx.reserve(100000); - privateQMsgType.reserve(100000); - privateQOwner.reserve(100000); - #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -53,7 +50,7 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ num_threads(NUM_THREAD) \ reduction(+ \ @@ -286,25 +283,12 @@ void processMatchedVerticesAndSendMessages( } } // End of outer for -#pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } - -#pragma omp critical(sendMessageTransfer) - { - - QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); - QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); - QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); - QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); - - privateQLocalVtx.clear(); - privateQGhostVtx.clear(); - privateQMsgType.clear(); - privateQOwner.clear(); - } + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); } // End of while ( !U.empty() ) @@ -317,7 +301,7 @@ void processMatchedVerticesAndSendMessages( #endif } // End of parallel region - //Send the messages + // Send the messages for (int i = initialSize; i < QOwner.size(); i++) { diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index cbae1fc2..0439a08c 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -1,32 +1,34 @@ #include "MatchBoxPC.h" void queuesTransfer(staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { - #pragma omp critical(U) { while (!privateU.empty()) U.push_back(privateU.pop_back()); } -#pragma omp critical(privateMsg) +#pragma omp critical(sendMessageTransfer) { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); } 
\ No newline at end of file From a259e8ab53cb000416233940207972badafd7daa Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 11:34:43 -0500 Subject: [PATCH 78/96] extractUChunch optimization --- amgprec/impl/aggregator/extractUChunk.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp index b5bc1f5f..e26d1011 100644 --- a/amgprec/impl/aggregator/extractUChunk.cpp +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -11,14 +11,18 @@ void extractUChunk( { if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U + { while (!privateU.empty()) - U.push_back(privateU.pop_front()); - - for (int i = 0; i < UCHUNK; i++) - { // Pop the new nodes - if (U.empty()) - break; - UChunkBeingProcessed.push_back(U.pop_front()); + UChunkBeingProcessed.push_back(privateU.pop_back()); + } + else + { + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; + UChunkBeingProcessed.push_back(U.pop_back()); + } } } // End of critical U From 6414d3aef32c0818babf5d3dc139c66ac5918328 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 12:47:43 -0500 Subject: [PATCH 79/96] U and privateU are now vectors --- amgprec/impl/aggregator/MatchBoxPC.h | 26 +++++++++---------- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 3 +-- amgprec/impl/aggregator/extractUChunk.cpp | 12 +++++---- amgprec/impl/aggregator/initialize.cpp | 15 +++++------ .../impl/aggregator/processExposedVertex.cpp | 4 +-- .../aggregator/processMatchedVertices.cpp | 25 ++++++++++++++++-- .../processMatchedVerticesAndSendMessages.cpp | 4 +-- amgprec/impl/aggregator/processMessages.cpp | 2 +- amgprec/impl/aggregator/queueTransfer.cpp | 9 ++++--- 9 files changed, 60 insertions(+), 40 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 1066f8ef..a1fddb59 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -183,8 +183,8 @@ extern "C" MilanLongInt *verLocInd, MilanReal *edgeLocWeight); - void queuesTransfer(staticQueue &U, - staticQueue &privateU, + void queuesTransfer(vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, @@ -231,8 +231,8 @@ extern "C" vector &QMsgType, vector &QOwner, MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &privateQLocalVtx, vector &privateQGhostVtx, vector &privateQMsgType, @@ -278,8 +278,8 @@ extern "C" vector &Counter, MilanInt myRank, MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, @@ -295,8 +295,8 @@ extern "C" void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCardPtr, @@ -327,8 +327,8 @@ extern "C" void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCardPtr, @@ -404,12 +404,12 @@ extern "C" MilanLongInt u, MilanLongInt v, MilanLongInt *SPtr, - staticQueue &U); + vector &U); void extractUChunk( vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU); + vector &U, + 
vector &privateU); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 4297391a..bb2dd5a7 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -182,7 +182,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector GMate; // Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; - staticQueue U, privateU; vector PCumulative, PMessageBundle, PSizeInfoMessages; vector SRequest; // Requests that are used for each send message vector SStatus; // Status of sent messages, used in MPI_Wait @@ -192,6 +191,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector privateQLocalVtx, privateQGhostVtx, privateQMsgType; vector privateQOwner; + vector U, privateU; initialize(NLVer, NLEdge, StartIndex, EndIndex, &numGhostEdges, @@ -240,7 +240,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * TODO: Test when it's actually more efficient to execute this code * in parallel. */ - PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, candidateMate, verLocInd, diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp index e26d1011..923a0b51 100644 --- a/amgprec/impl/aggregator/extractUChunk.cpp +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -2,8 +2,8 @@ void extractUChunk( vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU) + vector &U, + vector &privateU) { UChunkBeingProcessed.clear(); @@ -13,7 +13,8 @@ void extractUChunk( if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U { while (!privateU.empty()) - UChunkBeingProcessed.push_back(privateU.pop_back()); + UChunkBeingProcessed.push_back(privateU.back()); + privateU.pop_back(); } else { @@ -21,9 +22,10 @@ void extractUChunk( { // Pop the new nodes if (U.empty()) break; - UChunkBeingProcessed.push_back(U.pop_back()); + UChunkBeingProcessed.push_back(U.back()); + U.pop_back(); } } - } // End of critical U + } // End of critical U // End of critical U } \ No newline at end of file diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 47f424fd..17a4169e 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -19,8 +19,8 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, vector &QMsgType, vector &QOwner, MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &privateQLocalVtx, vector &privateQGhostVtx, vector &privateQMsgType, @@ -288,18 +288,15 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ - new (&U) staticQueue(NLVer + (*numGhostVertices)); + //new (&U) staticQueue(NLVer + (*numGhostVertices)); + U.reserve(NLVer + (*numGhostVertices)); - // TODO how can I decide a more meaningfull size? - MilanLongInt size = (*numGhostVertices); - - // Initialize the privte data structure - new (&privateU) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I put a meaningfull size? 
- + // Initialize the private vectors privateQLocalVtx.reserve(*numGhostVertices); privateQGhostVtx.reserve(*numGhostVertices); privateQMsgType.reserve(*numGhostVertices); privateQOwner.reserve(*numGhostVertices); + privateU.reserve(*numGhostVertices); } // end of task } // End of single region diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c330e724..49227158 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -19,8 +19,8 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, vector &Counter, MilanInt myRank, MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 510c9877..e96dcc1d 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -3,8 +3,8 @@ void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCard, @@ -275,6 +275,27 @@ void processMatchedVertices( privateQMsgType, privateQOwner); +#pragma omp critical(U) + { + U.insert(U.end(), privateU.begin(), privateU.end()); + } + + privateU.clear(); + +#pragma omp critical(sendMessageTransfer) + { + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); + } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + } // End of while ( !U.empty() ) #ifdef COUNT_LOCAL_VERTEX diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index debfc5ca..3322a05b 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -3,8 +3,8 @@ void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCard, diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 4150a330..804790c6 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -25,7 +25,7 @@ void processMessages( MilanLongInt u, MilanLongInt v, MilanLongInt *S, - staticQueue &U) + vector &U) { //#define PRINT_DEBUG_INFO_ diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index 0439a08c..7200b43d 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -1,7 +1,7 @@ #include "MatchBoxPC.h" -void queuesTransfer(staticQueue &U, - staticQueue &privateU, +void queuesTransfer(vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, @@ -14,10 +14,11 @@ void queuesTransfer(staticQueue &U, #pragma omp critical(U) { - 
while (!privateU.empty()) - U.push_back(privateU.pop_back()); + U.insert(U.end(), privateU.begin(), privateU.end()); } + privateU.clear(); + #pragma omp critical(sendMessageTransfer) { From d59c9e6c0a56e0399b4884d2ca11c5d3ebe5556c Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 22 Nov 2022 03:02:51 -0500 Subject: [PATCH 80/96] Updates towards OpenMP version. --- amgprec/amg_d_matchboxp_mod.f90 | 2 +- amgprec/impl/aggregator/Makefile | 1 + amgprec/impl/aggregator/clean.cpp | 4 ++-- samples/advanced/pdegen/Makefile | 2 +- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index a18d62d6..f6cb38ff 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -1109,7 +1109,7 @@ contains verlocptr(:) = verlocptr(:) + 1 verlocind(:) = verlocind(:) + 1 verdistance(:) = verdistance(:) + 1 - + if (me==0) write(0,*) 'Ph0/1/2 time ',ph0_time, ph1_time, ph2_time if (debug_sync) then call psb_barrier(ictxt) if (me == 0) write(0,*)' Done MatchBoxP ' diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 00e43088..11027ac1 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -67,6 +67,7 @@ initialize.o \ extractUChunk.o \ isAlreadyMatched.o \ findOwnerOfGhost.o \ +clean.o \ computeCandidateMate.o \ parallelComputeCandidateMateB.o \ processMatchedVertices.o \ diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 62f366b2..f316aee7 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -33,7 +33,7 @@ void clean(MilanLongInt NLVer, cout << "\n(" << myRank << ") Waitall " << endl; fflush(stdout); #endif - return; + //return; MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); @@ -88,4 +88,4 @@ void clean(MilanLongInt NLVer, #endif } } -} \ No newline at end of file +} diff --git a/samples/advanced/pdegen/Makefile b/samples/advanced/pdegen/Makefile index 8a49c73a..b5092a22 100644 --- a/samples/advanced/pdegen/Makefile +++ b/samples/advanced/pdegen/Makefile @@ -3,7 +3,7 @@ AMGINCDIR=$(AMGDIR)/include include $(AMGINCDIR)/Make.inc.amg4psblas AMGMODDIR=$(AMGDIR)/modules AMGLIBDIR=$(AMGDIR)/lib -AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec -llapack -lblas +AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec FINCLUDES=$(FMFLAG). $(FMFLAG)$(AMGMODDIR) $(FMFLAG)$(AMGINCDIR) $(PSBLAS_INCLUDES) $(FIFLAG). LINKOPT= diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index b6c448c3..0cd5d6c5 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0020 ! IDIM; domain size. Linear system size is IDIM**3 +0200 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
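Besides the build and input tweaks, the clean.cpp hunk above comments out the early return; so the MPI_Waitall on the outstanding send requests actually runs before the routine exits. The sketch below illustrates only that ordering in a stand-alone program (ring exchange, placeholder payload and tag); it does not reproduce the SRequest/SStatus bookkeeping of clean() itself, and the buffer sizing is an assumption of this example.

// Editorial sketch: complete pending requests and detach the Bsend buffer
// before tearing MPI down; returning early would leave them dangling.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Attach a buffer large enough for one 3-integer packet per destination.
    int packet = 3 * sizeof(long);
    std::vector<char> sendBuf(nprocs * (packet + MPI_BSEND_OVERHEAD));
    MPI_Buffer_attach(sendBuf.data(), (int)sendBuf.size());

    long msg[3] = {rank, (rank + 1) % nprocs, 42};   // placeholder payload
    long in[3];
    MPI_Request req;
    MPI_Irecv(in, 3, MPI_LONG, (rank + nprocs - 1) % nprocs, 0,
              MPI_COMM_WORLD, &req);
    MPI_Bsend(msg, 3, MPI_LONG, (rank + 1) % nprocs, 0, MPI_COMM_WORLD);

    // Completing the request before detaching the buffer is the point:
    // skipping this (as the removed "return;" did) leaves the buffered send
    // and the posted receive unfinished.
    MPI_Status st;
    MPI_Waitall(1, &req, &st);

    void *detached;
    int detachedSize;
    MPI_Buffer_detach(&detached, &detachedSize);  // blocks until Bsends drain

    std::printf("rank %d received %ld %ld %ld\n", rank, in[0], in[1], in[2]);
    MPI_Finalize();
    return 0;
}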
ISTOPC From 32994c7ce82dbdae0fd908b6f87bebf15b9f8a2e Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 13 Dec 2022 06:44:12 -0500 Subject: [PATCH 81/96] Better parameters in matchboxp_mod --- amgprec/amg_d_matchboxp_mod.f90 | 17 +++++++++-------- amgprec/amg_s_matchboxp_mod.f90 | 15 ++++++++------- amgprec/stZnqhkT | 1 - 3 files changed, 17 insertions(+), 16 deletions(-) delete mode 100644 amgprec/stZnqhkT diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index f6cb38ff..2df58797 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -146,6 +146,7 @@ contains & debug_ilaggr=.false., debug_sync=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. + integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 ictxt = desc_a%get_ctxt() call psb_info(ictxt,iam,np) @@ -187,7 +188,7 @@ contains call desc_a%l2gip(ilv,info,owned=.false.) call psb_geall(ilaggr,desc_a,info) - ilaggr = -1 + ilaggr = ilaggr_neginit call psb_geasb(ilaggr,desc_a,info) nr = a%get_nrows() nc = a%get_ncols() @@ -259,7 +260,7 @@ contains cycle else - if (ilaggr(k) == -1) then + if (ilaggr(k) == ilaggr_neginit) then wk = w(k) widx = w(idx) @@ -267,7 +268,7 @@ contains nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) if (nrmagg > epsilon(nrmagg)) then if (idx <= nr) then - if (ilaggr(idx) == -1) then + if (ilaggr(idx) == ilaggr_neginit) then ! Now, if both vertices are local, the aggregate is local ! (kinda obvious). nlaggr(iam) = nlaggr(iam) + 1 @@ -294,7 +295,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if else ! Use a statistically unbiased tie-breaking rule, @@ -309,7 +310,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if end if end if @@ -332,7 +333,7 @@ contains if (do_timings) call psb_tic(idx_phase3) ! Ok, now compute offsets, gather halo and fix non-local - ! aggregates (those where ilaggr == -2) + ! aggregates (those where ilaggr == ilaggr_nonlocal) call psb_sum(ictxt,nlaggr) ntaggr = sum(nlaggr(0:np-1)) naggrm1 = sum(nlaggr(0:iam-1)) @@ -347,7 +348,7 @@ contains call psb_halo(wtemp,desc_a,info) ! Cleanup as yet unmarked entries do k=1,nr - if (ilaggr(k) == -2) then + if (ilaggr(k) == ilaggr_nonlocal) then idx = mate(k) if (idx > nr) then i = ilaggr(idx) @@ -1109,7 +1110,7 @@ contains verlocptr(:) = verlocptr(:) + 1 verlocind(:) = verlocind(:) + 1 verdistance(:) = verdistance(:) + 1 - if (me==0) write(0,*) 'Ph0/1/2 time ',ph0_time, ph1_time, ph2_time + if (debug_sync) then call psb_barrier(ictxt) if (me == 0) write(0,*)' Done MatchBoxP ' diff --git a/amgprec/amg_s_matchboxp_mod.f90 b/amgprec/amg_s_matchboxp_mod.f90 index 9061344f..9144d70e 100644 --- a/amgprec/amg_s_matchboxp_mod.f90 +++ b/amgprec/amg_s_matchboxp_mod.f90 @@ -146,6 +146,7 @@ contains & debug_ilaggr=.false., debug_sync=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. + integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 ictxt = desc_a%get_ctxt() call psb_info(ictxt,iam,np) @@ -187,7 +188,7 @@ contains call desc_a%l2gip(ilv,info,owned=.false.) 
call psb_geall(ilaggr,desc_a,info) - ilaggr = -1 + ilaggr = ilaggr_neginit call psb_geasb(ilaggr,desc_a,info) nr = a%get_nrows() nc = a%get_ncols() @@ -259,7 +260,7 @@ contains cycle else - if (ilaggr(k) == -1) then + if (ilaggr(k) == ilaggr_neginit) then wk = w(k) widx = w(idx) @@ -267,7 +268,7 @@ contains nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) if (nrmagg > epsilon(nrmagg)) then if (idx <= nr) then - if (ilaggr(idx) == -1) then + if (ilaggr(idx) == ilaggr_neginit) then ! Now, if both vertices are local, the aggregate is local ! (kinda obvious). nlaggr(iam) = nlaggr(iam) + 1 @@ -294,7 +295,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if else ! Use a statistically unbiased tie-breaking rule, @@ -309,7 +310,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if end if end if @@ -332,7 +333,7 @@ contains if (do_timings) call psb_tic(idx_phase3) ! Ok, now compute offsets, gather halo and fix non-local - ! aggregates (those where ilaggr == -2) + ! aggregates (those where ilaggr == ilaggr_nonlocal) call psb_sum(ictxt,nlaggr) ntaggr = sum(nlaggr(0:np-1)) naggrm1 = sum(nlaggr(0:iam-1)) @@ -347,7 +348,7 @@ contains call psb_halo(wtemp,desc_a,info) ! Cleanup as yet unmarked entries do k=1,nr - if (ilaggr(k) == -2) then + if (ilaggr(k) == ilaggr_nonlocal) then idx = mate(k) if (idx > nr) then i = ilaggr(idx) diff --git a/amgprec/stZnqhkT b/amgprec/stZnqhkT deleted file mode 100644 index 8b277f0d..00000000 --- a/amgprec/stZnqhkT +++ /dev/null @@ -1 +0,0 @@ -! From ebe9b451775dd0df0ac0d9c0ffa28db64e87da87 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 07:50:58 -0500 Subject: [PATCH 82/96] Modify MATCHBOXP to fix OpenMP. 
Performance to be reviewed --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 175 ++++-- .../impl/aggregator/computeCandidateMate.cpp | 41 +- .../parallelComputeCandidateMateB.cpp | 3 +- .../impl/aggregator/processExposedVertex.cpp | 156 +++--- .../aggregator/processMatchedVertices.cpp | 437 +++++++-------- .../processMatchedVerticesAndSendMessages.cpp | 517 +++++++++--------- amgprec/impl/aggregator/processMessages.cpp | 366 ++++++------- amgprec/impl/aggregator/queueTransfer.cpp | 3 +- .../impl/aggregator/sendBundledMessages.cpp | 241 ++++---- 9 files changed, 978 insertions(+), 961 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index bb2dd5a7..49b366a6 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -70,7 +70,7 @@ Statistics: ph0_time, ph1_time, ph2_time: Runtimes Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) */ - +//#define DEBUG_HANG_ #ifdef SERIAL_MPI #else @@ -110,17 +110,24 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") verDistance [" << verDistance[0] << "," << verDistance[1] << "," << verDistance[2] << "," << verDistance[3] << "]"; + cout << "\n(" << myRank << ") verDistance [" ; + for (int i = 0; i < numProcs; i++) + cout << verDistance[i] << "," << verDistance[i+1]; + cout << "]\n"; fflush(stdout); #endif #ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") verDistance [" << verDistance[0] << "," << verDistance[1] << "," << verDistance[2] << "," << verDistance[3] << "]"; + if (myRank == 0) { + cout << "\n(" << myRank << ") verDistance [" ; + for (int i = 0; i < numProcs; i++) + cout << verDistance[i] << "," ; + cout << verDistance[numProcs]<< "]\n"; + } fflush(stdout); #endif MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank - MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank + MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; @@ -211,7 +218,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( finishTime = MPI_Wtime(); *ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization - +#ifdef DEBUG_HANG_ + cout << myRank << " Finished initialization" << endl; + fflush(stdout); +#endif + startTime = MPI_Wtime(); ///////////////////////////////////////////////////////////////////////////////////////// @@ -233,6 +244,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); +#ifdef DEBUG_HANG_ + cout << myRank << " Finished Exposed Vertex" << endl; + fflush(stdout); +#if 0 + cout << myRank << " candidateMate after parallelCompute " < &privateQGhostVtx, vector &privateQMsgType, vector &privateQOwner) -{ +{ MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; - MilanInt ghostOwner = 0, option; + MilanInt ghostOwner = 0, option, igw; -#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ - firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \ - 
num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \ + default(shared) num_threads(NUM_THREAD) { #pragma omp for reduction(+ \ : PCounter[:numProcs], myCard \ [:1], msgInd \ [:1], NumMessagesBundled \ - [:1]) schedule(static) - for (v = 0; v < NLVer; v++) - { + [:1]) \ + schedule(static) + for (v = 0; v < NLVer; v++) { option = -1; // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) k = candidateMate[v]; @@ -67,91 +67,81 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, #pragma omp critical(processExposed) { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; } - - if (w >= 0) - { - (*myCard)++; - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost vertex - option = 2; - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) - { - option = 1; - Mate[v] = w; - GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost - - } // End of if CandidateMate[w] = v - - } // End of if a Ghost Vertex - else - { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) - { - option = 3; - Mate[v] = w; // v is local - Mate[w - StartIndex] = v + StartIndex; // w is local - + + if (w >= 0) { + (*myCard)++; + if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex + option = 2; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { + option = 1; + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + option = 3; + Mate[v] = w; // v is local + Mate[w - StartIndex] = v + StartIndex; // w is local + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); #endif - - } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) - } // End of Else - + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + } // End of second if - + } // End critical processExposed - + } // End of if(w >=0) - else - { - // This piece of code is executed a really small amount of times - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + else { + // This piece of code is executed a really small amount of times + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << 
myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - (*msgInd)++; - (*NumMessagesBundled)++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop + (*msgInd)++; + (*NumMessagesBundled)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop } // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - + switch (option) { case -1: @@ -202,4 +192,4 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQOwner); } // End of parallel region -} \ No newline at end of file +} diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index e96dcc1d..d9363c39 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -46,264 +46,249 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ - firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ - num_threads(NUM_THREAD) \ - reduction(+ \ - : msgInd[:1], PCounter \ - [:numProcs], myCard \ - [:1], NumMessagesBundled \ + //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \ + privateQMsgType, privateQOwner, UChunkBeingProcessed) \ + default(shared) num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ [:1]) { - while (!U.empty()) - { - - extractUChunk(UChunkBeingProcessed, U, privateU); - - for (MilanLongInt u : UChunkBeingProcessed) - { + while (!U.empty()) { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices - + if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices + #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - option = -1; - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: - + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: + #ifdef 
PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif #pragma omp atomic read - mateVal = Mate[v - StartIndex]; - // If the current vertex is pointing to a matched vertex and is not matched - if (mateVal < 0) - { + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) { #pragma omp critical - { - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - + { + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + // If found a dominating edge: + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - option = 2; - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - option = 1; - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - option = 3; - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - option = 4; // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u - } // End of task - } // mateval < 0 - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex - + } // 
End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { // Neighbor is a ghost vertex + #pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - option = 5; // u is local - } // End of critical - } // End of Else //A Ghost Vertex - - switch (option) - { - case -1: - // No things to do - break; - case 1: - // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v - privateU.push_back(v); - privateU.push_back(w); - - (*myCard)++; + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); + + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); - case 2: - - // Found a dominating edge, it is a ghost - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - break; - case 3: - privateU.push_back(v); - privateU.push_back(w); - - (*myCard)++; - break; - case 4: - // Could not find a dominating vertex - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + (*NumMessagesBundled)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); + + (*myCard)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, 
verDistance, myRank, numProcs); + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - break; - case 5: - default: - + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + + PCounter[ghostOwner]++; + (*NumMessagesBundled)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - - (*NumMessagesBundled)++; - PCounter[ghostOwner]++; - (*msgInd)++; - - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - break; - } // End of switch - - } // End of inner for - } - } // End of outer for - - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + + (*NumMessagesBundled)++; + PCounter[ghostOwner]++; + (*msgInd)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + #pragma omp critical(U) - { - U.insert(U.end(), privateU.begin(), privateU.end()); - } - - privateU.clear(); - + { + U.insert(U.end(), privateU.begin(), privateU.end()); + } + + privateU.clear(); + #pragma omp critical(sendMessageTransfer) - { - - QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); - QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); - QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); - QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); - } - - privateQLocalVtx.clear(); - privateQGhostVtx.clear(); - privateQMsgType.clear(); - privateQOwner.clear(); - + { + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), 
privateQOwner.end()); + } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + } // End of while ( !U.empty() ) - + #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, omp_get_thread_num(), myRank); - + #endif } // End of parallel region } diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 3322a05b..469d7a16 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -1,39 +1,39 @@ #include "MatchBoxPC.h" - +//#define DEBUG_HANG_ void processMatchedVerticesAndSendMessages( - MilanLongInt NLVer, - vector &UChunkBeingProcessed, - vector &U, - vector &privateU, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - MilanLongInt *myCard, - MilanLongInt *msgInd, - MilanLongInt *NumMessagesBundled, - MilanLongInt *SPtr, - MilanLongInt *verLocPtr, - MilanLongInt *verLocInd, - MilanLongInt *verDistance, - MilanLongInt *PCounter, - vector &Counter, - MilanInt myRank, - MilanInt numProcs, - MilanLongInt *candidateMate, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap, - MilanReal *edgeLocWeight, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - vector &privateQLocalVtx, - vector &privateQGhostVtx, - vector &privateQMsgType, - vector &privateQOwner, - MPI_Comm comm, - MilanLongInt *msgActual, - vector &Message) + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + vector &U, + vector &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, + MPI_Comm comm, + MilanLongInt *msgActual, + vector &Message) { MilanLongInt initialSize = QLocalVtx.size(); @@ -50,266 +50,259 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ - firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ - num_threads(NUM_THREAD) \ - reduction(+ \ - : msgInd[:1], PCounter \ - [:numProcs], myCard \ - [:1], NumMessagesBundled \ - [:1], msgActual \ + //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\ + privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ [:1]) { - while (!U.empty()) - { - - extractUChunk(UChunkBeingProcessed, U, privateU); - - for (MilanLongInt u : UChunkBeingProcessed) - { + while (!U.empty()) { + + extractUChunk(UChunkBeingProcessed, U, 
privateU); + + for (MilanLongInt u : UChunkBeingProcessed) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices - + if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices + #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - option = -1; - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: - + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif #pragma omp atomic read - mateVal = Mate[v - StartIndex]; - // If the current vertex is pointing to a matched vertex and is not matched - if (mateVal < 0) - { + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) { #pragma omp critical - { - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - + { + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + // If found a dominating edge: + if (w >= 0) { + + if ((w < StartIndex) || (w > EndIndex)) { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - option = 2; - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - option = 1; - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - option = 3; - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - + option = 2; + + if (candidateMate[NLVer + 
Ghost2LocalMap[w]] == v) { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - option = 4; // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u - } // End of task - } // mateval < 0 - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex - + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { // Neighbor is a ghost vertex + #pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - option = 5; // u is local - } // End of critical - } // End of Else //A Ghost Vertex - - switch (option) - { - case -1: - // No things to do - break; - case 1: - // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v - privateU.push_back(v); - privateU.push_back(w); - (*myCard)++; + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); - case 2: - - // Found a dominating edge, it is a ghost - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - - // Build the Message Packet: - // Message[0] = v; // LOCAL - // Message[1] = w; // GHOST - // Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - break; - case 3: - privateU.push_back(v); - privateU.push_back(w); - (*myCard)++; - break; - case 4: - // Could not find a dominating vertex - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - + // Decrement the counter: + 
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); + (*myCard)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - - // Build the Message Packet: - // Message[0] = v; // LOCAL - // Message[1] = w; // GHOST - // Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - break; - case 5: - default: - + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - - // Build the Message Packet: - // Message[0] = u; // LOCAL - // Message[1] = v; // GHOST - // Message[2] = SUCCESS; // TYPE - // Send a Request (Asynchronous) - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - (*msgInd)++; - - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - break; - } // End of switch - } // End of inner for - } - } // End of outer for - - queuesTransfer(U, privateU, QLocalVtx, - 
QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = u; // LOCAL + // Message[1] = v; // GHOST + // Message[2] = SUCCESS; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + } // End of while ( !U.empty() ) - + #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, omp_get_thread_num(), myRank); - + #endif } // End of parallel region - + // Send the messages - for (int i = initialSize; i < QOwner.size(); i++) - { - - Message[0] = QLocalVtx[i]; - Message[1] = QGhostVtx[i]; - Message[2] = QMsgType[i]; - ghostOwner = QOwner[i]; - - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +#ifdef DEBUG_HANG_ + cout << myRank<<" Sending: "<(), ghostOwner, ComputeTag, comm); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); } +#ifdef DEBUG_HANG_ + cout << myRank<<" Done sending messages"<(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); if (error_codeC != MPI_SUCCESS) { @@ -86,70 +88,66 @@ void processMessages( fflush(stdout); } Sender = computeStatus.MPI_SOURCE; - + //cout << " ...from "<(), Sender, BundleTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS) - { - MPI_Error_string(error_codeC, error_message, &message_length); - cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; - fflush(stdout); - } + // Receive the message + //cout << myRank<<" Receiving from "<(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; + fflush(stdout); + } #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message Bundle After: " << endl; - for (int i = 0; i < bundleSize; i++) - cout << ReceiveBuffer[i] << ","; - cout << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message Bundle After: " << endl; + for (int i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); #endif - } - else - { // Just a single message: + } else { // Just a single message: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; + fflush(stdout); #endif - // Add the current message to Queue: - bundleSize = 3; //#of integers in the message - // Build the Message Buffer: - if (!ReceiveBuffer.empty()) - ReceiveBuffer.clear(); // Empty it out first - ReceiveBuffer.resize(bundleSize, -1); // Initialize - - ReceiveBuffer[0] = Message[0]; // u - ReceiveBuffer[1] = Message[1]; // v - ReceiveBuffer[2] = Message[2]; // message_type + // Add the 
current message to Queue: + bundleSize = 3; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize + + ReceiveBuffer[0] = Message[0]; // u + ReceiveBuffer[1] = Message[1]; // v + ReceiveBuffer[2] = Message[2]; // message_type } - + #ifdef DEBUG_GHOST_ - if ((v < StartIndex) || (v > EndIndex)) - { - cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; - fflush(stdout); + if ((v < StartIndex) || (v > EndIndex)) { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); } #endif #ifdef PRINT_DEBUG_INFO_ @@ -158,172 +156,160 @@ void processMessages( #endif // Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop - for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) - { - u = ReceiveBuffer[bundleCounter - 3]; // GHOST - v = ReceiveBuffer[bundleCounter - 2]; // LOCAL - message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE - - // CASE I: REQUEST - if (message_type == REQUEST) - { + for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { + u = ReceiveBuffer[bundleCounter - 3]; // GHOST + v = ReceiveBuffer[bundleCounter - 2]; // LOCAL + message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE + + // CASE I: REQUEST + if (message_type == REQUEST) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is REQUEST" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message type is REQUEST" << endl; + fflush(stdout); #endif #ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } #endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched (v is local) - candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost - if (candidateMate[v - StartIndex] == u) - { - GMate[Ghost2LocalMap[u]] = v; // u is ghost - Mate[v - StartIndex] = u; // v is local - U.push_back(v); - U.push_back(u); - (*myCard)++; + if (Mate[v - StartIndex] == -1) { + // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + U.push_back(v); + U.push_back(u); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; + fflush(stdout); #endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); - } // End of if ( candidateMate[v-StartIndex] == u )e - } // End of if ( Mate[v] == -1 ) - } // End of REQUEST - else - { // CASE II: SUCCESS - if (message_type == SUCCESS) - { + + 
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else { // CASE II: SUCCESS + if (message_type == SUCCESS) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); #ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } #endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched ( v is local) - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); - candidateMate[v - StartIndex] = w; + if (Mate[v - StartIndex] == -1) { + // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, + verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); + candidateMate[v - StartIndex] = w; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) + // If found a dominating edge: + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { + // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - //assert(ghostOwner != -1); - //assert(ghostOwner != myRank); - - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgInd)++; - (*msgActual)++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - 
GMate[Ghost2LocalMap[w]] = v; // w is ghost - U.push_back(v); - U.push_back(w); - (*myCard)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); + (*msgInd)++; + (*msgActual)++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + U.push_back(v); + U.push_back(w); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); #endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - (*myCard)++; + + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - { // No dominant edge found - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { + // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - //assert(ghostOwner != -1); - //assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgInd)++; - (*msgActual)++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); + //cout << myRank<<" Sending to "<(), ghostOwner, 
ComputeTag, comm); + (*msgInd)++; + (*msgActual)++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of if ( candidateMate[v-StartIndex] == u ) - } // End of if ( Mate[v] == -1 ) - } // End of if ( message_type == SUCCESS ) - else - { // CASE III: FAILURE + } // End of if ( candidateMate[v-StartIndex] == u ) + } // End of if ( Mate[v] == -1 ) + } // End of if ( message_type == SUCCESS ) + else { + // CASE III: FAILURE #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is FAILURE" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message type is FAILURE" << endl; + fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter - } // End of else: CASE III - } // End of else: CASE I + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I } - + return; -} \ No newline at end of file +} diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index 7200b43d..33c65749 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -32,4 +32,5 @@ void queuesTransfer(vector &U, privateQGhostVtx.clear(); privateQMsgType.clear(); privateQOwner.clear(); -} \ No newline at end of file + +} diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index f7fd2f78..80a88b94 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -38,108 +38,107 @@ void sendBundledMessages(MilanLongInt *numGhostEdges, #pragma omp task depend(inout \ : PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \ : NumMessagesBundled, numProcs) - {try { + { + try { PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message - PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure - PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets -} -catch (length_error) -{ - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); -} -PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize -PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable -PSizeInfoMessages.resize(numProcs * 3, 0); -} + PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize + PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable + PSizeInfoMessages.resize(numProcs * 3, 0); + } #pragma omp task depend(inout \ - : PCumulative) depend(in \ + : PCumulative) depend(in \ : PCounter) -{ - for (i = 0; i < numProcs; i++) - PCumulative[i + 1] = PCumulative[i] + PCounter[i]; -} - + 
{ + for (i = 0; i < numProcs; i++) + PCumulative[i + 1] = PCumulative[i] + PCounter[i]; + } + #pragma omp task depend(inout \ : PCounter) -{ - // Reuse PCounter to keep track of how many messages were inserted: - for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! - PCounter[i] = 0; -} + { + // Reuse PCounter to keep track of how many messages were inserted: + for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! + PCounter[i] = 0; + } // Build the Message Bundle packet: #pragma omp task depend(in \ : PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \ : myIndex, PMessageBundle, PCounter) { - for (i = 0; i < NumMessagesBundled; i++) - { - myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; - PMessageBundle[myIndex + 0] = QLocalVtx[i]; - PMessageBundle[myIndex + 1] = QGhostVtx[i]; - PMessageBundle[myIndex + 2] = QMsgType[i]; - PCounter[QOwner[i]]++; - } -} - + for (i = 0; i < NumMessagesBundled; i++) { + myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; + PMessageBundle[myIndex + 0] = QLocalVtx[i]; + PMessageBundle[myIndex + 1] = QGhostVtx[i]; + PMessageBundle[myIndex + 2] = QMsgType[i]; + PCounter[QOwner[i]]++; + } + } + // Send the Bundled Messages: Use ISend #pragma omp task depend(out \ : SRequest, SStatus) -{ - try - { - SRequest.reserve(numProcs * 2); // At most two messages per processor - SStatus.reserve(numProcs * 2); // At most two messages per processor - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } -} - + { + try + { + SRequest.reserve(numProcs * 2); // At most two messages per processor + SStatus.reserve(numProcs * 2); // At most two messages per processor + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + } + // Send the Messages #pragma omp task depend(inout \ : SRequest, PSizeInfoMessages, PCumulative) depend(out \ : *msgActual, *msgInd) { - for (i = 0; i < numProcs; i++) - { // Changed by Fabio to be an integer, addresses needs to be integers! - if (i == myRank) // Do not send anything to yourself - continue; - // Send the Message with information about the size of next message: - // Build the Message Packet: - PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message - PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet - PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE - // Send a Request (Asynchronous) + for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! 
+ if (i == myRank) // Do not send anything to yourself + continue; + // Send the Message with information about the size of next message: + // Build the Message Packet: + PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message + PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet + PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; + fflush(stdout); #endif - if (PSizeInfoMessages[i * 3 + 0] > 0) - { // Send only if it is a nonempty packet - MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[(*msgInd)]); - (*msgActual)++; - (*msgInd)++; - // Now Send the message with the data packet: + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[(*msgInd)]); + (*msgActual)++; + (*msgInd)++; + // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; - for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) - cout << PMessageBundle[k] << ","; - cout << endl; - fflush(stdout); + cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; + for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) + cout << PMessageBundle[k] << ","; + cout << endl; + fflush(stdout); #endif - MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); - (*msgInd)++; - } // End of if size > 0 - } + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); + (*msgInd)++; + } // End of if size > 0 + } } #pragma omp task depend(inout \ @@ -147,64 +146,64 @@ PSizeInfoMessages.resize(numProcs * 3, 0); { // Free up temporary memory: - PCumulative.clear(); - QLocalVtx.clear(); - QGhostVtx.clear(); - QMsgType.clear(); - QOwner.clear(); + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); } #pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; - cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; - cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; - if (*numGhostEdges > 0) + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; + cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; + if (*numGhostEdges > 0) { - cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; } - fflush(stdout); + fflush(stdout); #endif - // Allocate 
memory for MPI Send messages: - /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */ - OneMessageSize = 0; - MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet - // How many messages to send? - // Potentially three kinds of messages will be sent/received: - // Request, Success, Failure. - // But only two will be sent from a given processor. - // Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; - *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; + // Allocate memory for MPI Send messages: + /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */ + OneMessageSize = 0; + MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; + *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; } #pragma omp task depend(out : Buffer) depend(in : *BufferSize) -{ - Buffer = 0; + { + Buffer = 0; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; - cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; - cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; - cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; - cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); - cout << "\n(" << myRank << ")Attaching Buffer on.. "; - fflush(stdout); + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); + cout << "\n(" << myRank << ")Attaching Buffer on.. 
"; + fflush(stdout); #endif - if ((*BufferSize) > 0) - { - Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory - if (Buffer == 0) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; - exit(1); - } - MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer - } + if ((*BufferSize) > 0) + { + Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; + exit(1); + } + MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer + } + } } } } -} \ No newline at end of file From a612cea167c0d74ffd14cf00eaea91ae61e96cc2 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 07:53:04 -0500 Subject: [PATCH 83/96] Debug for matchboxp --- amgprec/amg_d_matchboxp_mod.f90 | 39 ++++++++++++++++++++++++++++----- amgprec/amg_s_matchboxp_mod.f90 | 39 ++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index 2df58797..e19ce617 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -143,7 +143,7 @@ contains type(psb_ld_coo_sparse_mat) :: tmpcoo logical :: display_out_, print_out_, reproducible_ logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & - & debug_ilaggr=.false., debug_sync=.false. + & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 @@ -214,7 +214,20 @@ contains call psb_barrier(ictxt) if (iam == 0) write(0,*)' out from buildmatching:', info end if - + if (debug_mate) then + block + integer(psb_lpk_), allocatable :: ckmate(:) + allocate(ckmate(nr)) + ckmate(1:nr) = mate(1:nr) + call psb_msort(ckmate(1:nr)) + do i=1,nr-1 + if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then + write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i) + end if + end do + end block + end if + if (info == 0) then if (do_timings) call psb_tic(idx_phase2) if (debug_sync) then @@ -276,6 +289,9 @@ contains ilaggr(idx) = nlaggr(iam) wtemp(k) = w(k)/nrmagg wtemp(idx) = w(idx)/nrmagg + else + write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,& + &mate(idx),ilaggr(idx) end if nlpairs = nlpairs+1 else if (idx <= nc) then @@ -326,6 +342,12 @@ contains nlsingl = nlsingl + 1 end if end if + if (ilaggr(k) == ilaggr_neginit) then + write(0,*) iam,' Error: no update to ',k,mate(k),& + & abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k) + end if + else + if (ilaggr(k)<0) write(0,*) 'Strange? 
',k,ilaggr(k) end if end if end do @@ -360,9 +382,14 @@ contains else write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) end if - end if - if (ilaggr(k) <0) then - write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + else if (ilaggr(k) <0) then + write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + write(0,*) iam,' : : ',nr,nc,mate(k) + if (mate(k) <= nr) then + write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),& + & ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k))) + end if + flush(0) end if end do if (debug_sync) then @@ -415,7 +442,7 @@ contains end block if (iam == 0) then - write(0,*) 'Matching statistics: Unmatched nodes ',& + write(0,*) iam,'Matching statistics: Unmatched nodes ',& & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs end if diff --git a/amgprec/amg_s_matchboxp_mod.f90 b/amgprec/amg_s_matchboxp_mod.f90 index 9144d70e..a7f41c24 100644 --- a/amgprec/amg_s_matchboxp_mod.f90 +++ b/amgprec/amg_s_matchboxp_mod.f90 @@ -143,7 +143,7 @@ contains type(psb_ls_coo_sparse_mat) :: tmpcoo logical :: display_out_, print_out_, reproducible_ logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & - & debug_ilaggr=.false., debug_sync=.false. + & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 @@ -214,7 +214,20 @@ contains call psb_barrier(ictxt) if (iam == 0) write(0,*)' out from buildmatching:', info end if - + if (debug_mate) then + block + integer(psb_lpk_), allocatable :: ckmate(:) + allocate(ckmate(nr)) + ckmate(1:nr) = mate(1:nr) + call psb_msort(ckmate(1:nr)) + do i=1,nr-1 + if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then + write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i) + end if + end do + end block + end if + if (info == 0) then if (do_timings) call psb_tic(idx_phase2) if (debug_sync) then @@ -276,6 +289,9 @@ contains ilaggr(idx) = nlaggr(iam) wtemp(k) = w(k)/nrmagg wtemp(idx) = w(idx)/nrmagg + else + write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,& + &mate(idx),ilaggr(idx) end if nlpairs = nlpairs+1 else if (idx <= nc) then @@ -326,6 +342,12 @@ contains nlsingl = nlsingl + 1 end if end if + if (ilaggr(k) == ilaggr_neginit) then + write(0,*) iam,' Error: no update to ',k,mate(k),& + & abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k) + end if + else + if (ilaggr(k)<0) write(0,*) 'Strange? 
',k,ilaggr(k) end if end if end do @@ -360,9 +382,14 @@ contains else write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) end if - end if - if (ilaggr(k) <0) then - write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + else if (ilaggr(k) <0) then + write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + write(0,*) iam,' : : ',nr,nc,mate(k) + if (mate(k) <= nr) then + write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),& + & ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k))) + end if + flush(0) end if end do if (debug_sync) then @@ -415,7 +442,7 @@ contains end block if (iam == 0) then - write(0,*) 'Matching statistics: Unmatched nodes ',& + write(0,*) iam,'Matching statistics: Unmatched nodes ',& & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs end if From 73e5d499131c086b77edca6c94538fc2ecda17a0 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 2 Jun 2023 11:37:58 +0200 Subject: [PATCH 84/96] Added timers to build phases --- .../aggregator/amg_c_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_caggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../aggregator/amg_d_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_daggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../aggregator/amg_s_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_saggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../aggregator/amg_z_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_zaggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../impl/level/amg_c_base_onelev_mat_asb.f90 | 16 +++++++- .../impl/level/amg_d_base_onelev_mat_asb.f90 | 16 +++++++- .../impl/level/amg_s_base_onelev_mat_asb.f90 | 16 +++++++- .../impl/level/amg_z_base_onelev_mat_asb.f90 | 16 +++++++- 12 files changed, 224 insertions(+), 32 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 index 4efaf61d..250cc549 100644 --- a/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_c_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
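[Editorial sketch of the timing idiom used by this patch.] The hunks in this patch wrap each build phase in the PSBLAS tic/toc timers. Below is a minimal sketch of that idiom, assuming the psb_get_timer_idx / psb_tic / psb_toc utilities used in these hunks are available from the PSBLAS modules already included by these files; the subroutine name, module use, and timer label are illustrative only, not part of the patch:

    subroutine timed_phase_sketch()
      use psb_base_mod        ! assumed: provides psb_get_timer_idx, psb_tic, psb_toc
      implicit none
      ! Timer indices are registered once and cached across calls
      integer(psb_ipk_), save :: idx_phase = -1
      logical, parameter      :: do_timings = .true.

      ! Register the timer on first entry under a descriptive label
      if ((do_timings).and.(idx_phase == -1)) &
           & idx_phase = psb_get_timer_idx("SKETCH: my_phase")

      if (do_timings) call psb_tic(idx_phase)
      ! ... the work to be measured goes here ...
      if (do_timings) call psb_toc(idx_phase)
    end subroutine timed_phase_sketch

Keeping do_timings as a compile-time parameter lets the compiler drop the instrumentation when it is .false., which is how the hunks below leave it by default.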
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 index 53e740fe..c4a85b05 100644 --- a/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_spk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = czero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= czero) then adiag(i) = cone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = cone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 index 2edcca6c..26edbb0a 100644 --- a/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_d_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
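[Editorial sketch of the OpenMP idiom used by this patch.] The *aggrmat_smth_bld hunks above parallelize the matrix-filtering and diagonal-inversion loops with OpenMP worksharing. A minimal self-contained sketch of that pattern follows, mirroring the diagonal loop above; the plain types, constants, and subroutine name are illustrative substitutes for the library-specific psb_ kinds:

    subroutine invert_diag_sketch(n, adiag)
      implicit none
      integer, intent(in)             :: n
      real(kind(1.d0)), intent(inout) :: adiag(n)
      integer :: i
      ! Each iteration writes only adiag(i), so the loop parallelizes with a
      ! plain static schedule; no reduction or synchronization is required.
      !$OMP parallel do private(i) schedule(static)
      do i = 1, n
        if (adiag(i) /= 0.d0) then
          adiag(i) = 1.d0/adiag(i)
        else
          adiag(i) = 1.d0
        end if
      end do
      !$OMP end parallel do
    end subroutine invert_diag_sketch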
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 index 82da3fc7..d365bf27 100644 --- a/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_dpk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = dzero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= dzero) then adiag(i) = done / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = done end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 index c52c04f7..9529d141 100644 --- a/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_s_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 index d96176b2..c2eae3a4 100644 --- a/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_spk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = szero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= szero) then adiag(i) = sone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = sone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 index a64e3ebb..a6a7856e 100644 --- a/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_z_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 index 2f944699..7b8ed075 100644 --- a/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_dpk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = zzero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= zzero) then adiag(i) = zone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = zone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 index e79c90c9..27896806 100644 --- a/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_cspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_c_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! 
Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! + if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 index e9e55a9a..6bd4e1ac 100644 --- a/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_dspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_d_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 index 271b31d0..034151d3 100644 --- a/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_sspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_s_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 index 07ab3e0b..eb11cad2 100644 --- a/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_zspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_z_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 From 494b8b925ff06b9610db488f045909e166aa65cc Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 5 Jun 2023 11:46:39 +0200 Subject: [PATCH 85/96] OpenMP loop in samples data generation --- samples/advanced/pdegen/amg_d_genpde_mod.F90 | 420 ++++++++++--------- samples/advanced/pdegen/amg_s_genpde_mod.F90 | 420 ++++++++++--------- 2 files changed, 444 insertions(+), 396 deletions(-) diff --git a/samples/advanced/pdegen/amg_d_genpde_mod.F90 b/samples/advanced/pdegen/amg_d_genpde_mod.F90 index d6acd01c..ec3affc5 100644 --- a/samples/advanced/pdegen/amg_d_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_d_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! 
term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! 
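! Stripped-down, stand-alone sketch of the workspace pattern used in this
! loop: the OpenMP parallel region wraps a Fortran BLOCK construct, so each
! thread gets its own copies of the allocatable work arrays without changing
! the declarations of the host subroutine. The program name, array name and
! sizes below are illustrative; OPENMP is the preprocessor macro already used
! by this tree.
program block_private_sketch
#if defined(OPENMP)
  use omp_lib
#endif
  implicit none
  integer, parameter :: nb = 4, nlr = 20
  integer :: ii
  !$omp parallel
  block
    integer :: ith, nth
    real, allocatable :: val(:)
#if defined(OPENMP)
    nth = omp_get_num_threads()
    ith = omp_get_thread_num()
#else
    nth = 1
    ith = 0
#endif
    ! Declared inside the BLOCK, hence private to this thread.
    allocate(val(20*nb))
    !$omp do schedule(dynamic)
    do ii = 1, nlr, nb
      val(1) = real(ii)      ! per-block work on the private buffer
    end do
    !$omp end do
    deallocate(val)
  end block
  !$omp end parallel
end program block_private_sketch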
term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! 
x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(done,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! 
term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(done,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) diff --git a/samples/advanced/pdegen/amg_s_genpde_mod.F90 b/samples/advanced/pdegen/amg_s_genpde_mod.F90 index 7d32cf30..dfa79ab3 100644 --- a/samples/advanced/pdegen/amg_s_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_s_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! 
- val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! 
term depending on (x,y,z) + val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. 
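! Note on the error handling in the loops above: the serial version bailed
! out of the assembly loop with EXIT at the first failure, but a loop
! associated with an !$omp do construct may not be left prematurely, so the
! OpenMP version sets info and skips the remaining per-iteration work with
! CYCLE; the failure is then reported once, after the parallel region, in
! the tgen check that follows.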
- call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! 
term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) From 3a5e73e4c8e2871f7912f7525c8b5d997803d3d3 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 14 Jun 2023 14:05:42 +0200 Subject: [PATCH 86/96] adjust NTH in samples/pdegen --- .../{amg_d_pde2d.f90 => amg_d_pde2d.F90} | 18 ++++++++++++++++-- .../{amg_d_pde3d.f90 => amg_d_pde3d.F90} | 16 +++++++++++++++- .../{amg_s_pde2d.f90 => amg_s_pde2d.F90} | 18 ++++++++++++++++-- .../{amg_s_pde3d.f90 => amg_s_pde3d.F90} | 16 +++++++++++++++- 4 files changed, 62 insertions(+), 6 deletions(-) rename samples/advanced/pdegen/{amg_d_pde2d.f90 => amg_d_pde2d.F90} (98%) rename samples/advanced/pdegen/{amg_d_pde3d.f90 => amg_d_pde3d.F90} (98%) rename samples/advanced/pdegen/{amg_s_pde2d.f90 => amg_s_pde2d.F90} (98%) rename samples/advanced/pdegen/{amg_s_pde3d.f90 => amg_s_pde3d.F90} (98%) diff --git a/samples/advanced/pdegen/amg_d_pde2d.f90 b/samples/advanced/pdegen/amg_d_pde2d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_d_pde2d.f90 rename to samples/advanced/pdegen/amg_d_pde2d.F90 index c036aa6d..37e9fcd6 100644 --- a/samples/advanced/pdegen/amg_d_pde2d.f90 +++ b/samples/advanced/pdegen/amg_d_pde2d.F90 @@ -73,6 +73,9 @@ program amg_d_pde2d use amg_d_pde2d_exp_mod use amg_d_pde2d_box_mod use amg_d_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -93,7 +96,7 @@ program amg_d_pde2d type(psb_d_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -197,6 +200,15 @@ program amg_d_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
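! The thread count is queried inside a parallel region because
! omp_get_num_threads() returns 1 when called from the sequential part of
! the program; the MASTER block ensures a single thread writes the shared
! variable nth. A one-line alternative (not used here) would be to call
! omp_get_max_threads() from the sequential part, at the price of reporting
! the configured rather than the actually spawned number of threads.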
This should not happen, but just in case @@ -451,7 +463,9 @@ program amg_d_pde2d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) diff --git a/samples/advanced/pdegen/amg_d_pde3d.f90 b/samples/advanced/pdegen/amg_d_pde3d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_d_pde3d.f90 rename to samples/advanced/pdegen/amg_d_pde3d.F90 index 1f6118ca..a1ceea84 100644 --- a/samples/advanced/pdegen/amg_d_pde3d.f90 +++ b/samples/advanced/pdegen/amg_d_pde3d.F90 @@ -74,6 +74,9 @@ program amg_d_pde3d use amg_d_pde3d_exp_mod use amg_d_pde3d_gauss_mod use amg_d_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -94,7 +97,7 @@ program amg_d_pde3d type(psb_d_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -198,6 +201,15 @@ program amg_d_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! This should not happen, but just in case @@ -456,6 +468,8 @@ program amg_d_pde3d call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) diff --git a/samples/advanced/pdegen/amg_s_pde2d.f90 b/samples/advanced/pdegen/amg_s_pde2d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_s_pde2d.f90 rename to samples/advanced/pdegen/amg_s_pde2d.F90 index a81d16ff..eb8a8d63 100644 --- a/samples/advanced/pdegen/amg_s_pde2d.f90 +++ b/samples/advanced/pdegen/amg_s_pde2d.F90 @@ -73,6 +73,9 @@ program amg_s_pde2d use amg_s_pde2d_exp_mod use amg_s_pde2d_box_mod use amg_s_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -93,7 +96,7 @@ program amg_s_pde2d type(psb_s_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -197,6 +200,15 @@ program amg_s_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
This should not happen, but just in case @@ -451,7 +463,9 @@ program amg_s_pde2d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) diff --git a/samples/advanced/pdegen/amg_s_pde3d.f90 b/samples/advanced/pdegen/amg_s_pde3d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_s_pde3d.f90 rename to samples/advanced/pdegen/amg_s_pde3d.F90 index 7542c3a2..da5cd173 100644 --- a/samples/advanced/pdegen/amg_s_pde3d.f90 +++ b/samples/advanced/pdegen/amg_s_pde3d.F90 @@ -74,6 +74,9 @@ program amg_s_pde3d use amg_s_pde3d_exp_mod use amg_s_pde3d_gauss_mod use amg_s_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -94,7 +97,7 @@ program amg_s_pde3d type(psb_s_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -198,6 +201,15 @@ program amg_s_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! This should not happen, but just in case @@ -456,6 +468,8 @@ program amg_s_pde3d call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) From 2fd718be6fc195157e49133028c7dc68fed11888 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 19 Jul 2023 15:59:11 +0200 Subject: [PATCH 87/96] Updates and measurements for OpenMP build --- amgprec/impl/aggregator/amg_c_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_c_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/aggregator/amg_d_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_d_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/aggregator/amg_s_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_s_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/aggregator/amg_z_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_z_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_c_gs_solver_bld.f90 | 11 +- amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_d_gs_solver_bld.f90 | 11 +- amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_s_gs_solver_bld.f90 | 11 +- amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_z_gs_solver_bld.f90 | 11 +- samples/advanced/pdegen/amg_d_pde3d.F90 | 6 +- samples/advanced/pdegen/amg_s_pde3d.F90 | 6 +- samples/advanced/pdegen/runs/amg_pde3d.inp | 8 +- 19 files changed, 954 insertions(+), 146 deletions(-) rename 
amgprec/impl/aggregator/{amg_c_soc1_map_bld.f90 => amg_c_soc1_map_bld.F90} (59%) rename amgprec/impl/aggregator/{amg_d_soc1_map_bld.f90 => amg_d_soc1_map_bld.F90} (59%) rename amgprec/impl/aggregator/{amg_s_soc1_map_bld.f90 => amg_s_soc1_map_bld.F90} (59%) rename amgprec/impl/aggregator/{amg_z_soc1_map_bld.f90 => amg_z_soc1_map_bld.F90} (59%) diff --git a/amgprec/impl/aggregator/amg_c_ptap_bld.f90 b/amgprec/impl/aggregator/amg_c_ptap_bld.f90 index d787d7a7..02dcb1f4 100644 --- a/amgprec/impl/aggregator/amg_c_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_c_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
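! In the hunk below, the element-by-element compaction loop that kept only
! the local rows is replaced by a single backward search: after
! coo_restr%fix(info) the COO entries are assumed to be ordered by row
! index, so the non-local entries (ia(k) > nrl) form a trailing block and
! it suffices to locate the last entry with ia(k) <= nrl and truncate the
! nonzero count there with set_nzeros.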
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_c_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index d1c734fc..4ec81322 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_c_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
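! The aggregation update below follows a check / lock / re-check pattern:
! "disjoint" is first evaluated outside the critical section as a cheap
! filter, then evaluated again inside !$omp critical(update_ilaggr), since
! another thread may have claimed one of the strongly coupled neighbours in
! the meantime; only the second test, performed under the lock, decides
! whether the new aggregate is created and ilaggr is updated.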
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
@@ -274,7 +452,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_d_ptap_bld.f90 b/amgprec/impl/aggregator/amg_d_ptap_bld.f90 index 8520e58e..4006c04c 100644 --- a/amgprec/impl/aggregator/amg_d_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index e3ae5245..af0a7764 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_d_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
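In the block above, the disjointness test on the candidate aggregate is evaluated twice on purpose: once outside !$omp critical(update_ilaggr) as a cheap filter, and again inside the critical section, because another thread may have claimed one of the neighbours between the two tests. If the re-test fails, the row is simply left unaggregated and is picked up by phase two or phase three. A minimal sketch of this double-checked update follows; the names (try_claim, owner, nbr, unclaimed) are invented for the example and this is not the library routine.

    subroutine try_claim(owner, nbr, ip, i, cnt, unclaimed)
      implicit none
      integer, intent(inout) :: owner(:)   ! owner(j) = aggregate of node j, or unclaimed
      integer, intent(inout) :: cnt        ! aggregate counter (per thread, as locnaggr above)
      integer, intent(in)    :: nbr(:), ip, i, unclaimed
      if (all(owner(nbr(1:ip)) == unclaimed)) then     ! cheap test, no lock taken
        !$omp critical(claim_nbrs)
        if (all(owner(nbr(1:ip)) == unclaimed)) then   ! re-test: another thread may have won
          cnt = cnt + 1                                ! note: all() on an empty section is true,
          owner(nbr(1:ip)) = cnt                       ! so ip==0 still claims node i alone
          owner(i) = cnt
        end if
        !$omp end critical(claim_nbrs)
      end if
    end subroutine try_claim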
@@ -274,7 +452,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_s_ptap_bld.f90 b/amgprec/impl/aggregator/amg_s_ptap_bld.f90 index 93b79b63..e1a6c867 100644 --- a/amgprec/impl/aggregator/amg_s_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 3f4cc437..967ee669 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_s_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
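The amg_?_ptap_bld hunks above (the same change is applied to the z variant below) replace the element-by-element compaction of coo_restr with a truncation: once call coo_restr%fix(info) has put the entries in row order, the non-local rows (ia(k) > nrl) form a contiguous tail, so it is enough to search backwards for the last local entry and shrink the nonzero count, leaving the retained entries untouched. A standalone sketch of that truncation, with invented names and assuming the row indices are already sorted:

    subroutine keep_local_rows(ia, nnz, nrl)
      implicit none
      integer, intent(in)    :: ia(:)   ! row indices, nondecreasing (as after coo%fix())
      integer, intent(inout) :: nnz     ! nonzero count, shrunk on exit
      integer, intent(in)    :: nrl     ! number of local rows
      integer :: k
      do k = nnz, 1, -1
        if (ia(k) <= nrl) exit          ! last entry that belongs to a local row
      end do
      nnz = k                           ! k == 0 when no local entries remain
    end subroutine keep_local_rows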
@@ -274,7 +452,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_z_ptap_bld.f90 b/amgprec/impl/aggregator/amg_z_ptap_bld.f90 index 6faf1b71..e322a303 100644 --- a/amgprec/impl/aggregator/amg_z_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index d9f24130..611590cb 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_z_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
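The timing changes scattered through these patches all follow the same PSBLAS idiom, which is also applied below to the a%tril call in the gs/bwgs solver builds and reported at the end of the run by the amg_?_pde3d drivers: register a named timer once and keep its index in a saved variable, bracket the region of interest with psb_tic/psb_toc, and print the accumulated times with psb_print_timers. A condensed sketch of the pattern (the label string is made up; the psb_* calls are the ones used in the hunks above and below, inside a routine that already uses psb_base_mod):

      integer(psb_ipk_), save :: idx_mystep = -1
      logical, parameter      :: do_timings = .true.

      if ((do_timings).and.(idx_mystep == -1)) &
           & idx_mystep = psb_get_timer_idx("MYSTEP: phase of interest")
      if (do_timings) call psb_tic(idx_mystep)
      ! ... code being timed ...
      if (do_timings) call psb_toc(idx_mystep)

      ! once, at the end of the program (see the amg_?_pde3d.F90 hunks above):
      call psb_print_timers(ctxt)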
@@ -274,7 +452,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 index f760c80f..11ea6576 100644 --- a/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_c_gs_solver_bld.f90 b/amgprec/impl/solver/amg_c_gs_solver_bld.f90 index 3cdfe7e7..79be20b5 100644 --- a/amgprec/impl/solver/amg_c_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_c_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='c_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 index 859c8ebe..de5f91f8 100644 --- a/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_d_gs_solver_bld.f90 b/amgprec/impl/solver/amg_d_gs_solver_bld.f90 index 3cbc78ee..918712b5 100644 --- a/amgprec/impl/solver/amg_d_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_d_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 index e96e1229..d285d0b3 100644 --- a/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_s_gs_solver_bld.f90 b/amgprec/impl/solver/amg_s_gs_solver_bld.f90 index b4580f0f..6e0870b7 100644 --- a/amgprec/impl/solver/amg_s_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_s_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='s_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 index dec629f5..a953c608 100644 --- a/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_z_gs_solver_bld.f90 b/amgprec/impl/solver/amg_z_gs_solver_bld.f90 index b347937a..748a6122 100644 --- a/amgprec/impl/solver/amg_z_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_z_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='z_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! + !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/samples/advanced/pdegen/amg_d_pde3d.F90 b/samples/advanced/pdegen/amg_d_pde3d.F90 index a1ceea84..cb9542d4 100644 --- a/samples/advanced/pdegen/amg_d_pde3d.F90 +++ b/samples/advanced/pdegen/amg_d_pde3d.F90 @@ -195,7 +195,7 @@ program amg_d_pde3d ! 
other variables integer(psb_ipk_) :: info, i, k character(len=20) :: name,ch_err - + type(psb_d_csr_sparse_mat) :: amold info=psb_success_ @@ -402,7 +402,7 @@ program amg_d_pde3d end if call psb_barrier(ctxt) t1 = psb_wtime() - call prec%smoothers_build(a,desc_a,info) + call prec%smoothers_build(a,desc_a,info,amold=amold) tprec = psb_wtime()-t1 if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') @@ -492,7 +492,7 @@ program amg_d_pde3d write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() end if - + call psb_print_timers(ctxt) ! ! cleanup storage and exit ! diff --git a/samples/advanced/pdegen/amg_s_pde3d.F90 b/samples/advanced/pdegen/amg_s_pde3d.F90 index da5cd173..d6195c45 100644 --- a/samples/advanced/pdegen/amg_s_pde3d.F90 +++ b/samples/advanced/pdegen/amg_s_pde3d.F90 @@ -195,7 +195,7 @@ program amg_s_pde3d ! other variables integer(psb_ipk_) :: info, i, k character(len=20) :: name,ch_err - + type(psb_s_csr_sparse_mat) :: amold info=psb_success_ @@ -402,7 +402,7 @@ program amg_s_pde3d end if call psb_barrier(ctxt) t1 = psb_wtime() - call prec%smoothers_build(a,desc_a,info) + call prec%smoothers_build(a,desc_a,info,amold=amold) tprec = psb_wtime()-t1 if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') @@ -492,7 +492,7 @@ program amg_s_pde3d write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() end if - + call psb_print_timers(ctxt) ! ! cleanup storage and exit ! diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index 0cd5d6c5..7a3329cd 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0200 ! IDIM; domain size. Linear system size is IDIM**3 +0200 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC @@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F 30 ! IRST (restart for RGMRES and BiCGSTABL) 1.d-6 ! EPS %%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%% -ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) +ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML %%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%% FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous. @@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL -3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default -3 ! Target coarse matrix size per process; if <0, lib default SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED -COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED -MATCHBOXP ! aggregation measure SOC1, MATCHBOXP +DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED +SOC1 ! aggregation measure SOC1, MATCHBOXP 8 ! Requested size of the aggregates for MATCHBOXP NATURAL ! Ordering of aggregation NATURAL DEGREE -1.5 ! Coarsening ratio, if < 0 use library default From 7b9c722a1a8801109358358089aa1924374ef6bd Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 26 Jul 2023 14:32:13 +0200 Subject: [PATCH 88/96] Fixed OpenMP version of SOC1. 
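In outline, the changes below enable the OpenMP path of phase one, previously left under #if 0&&defined(OPENMP), and rework it so that it produces a consistent aggregation. The ils(:) work array is replaced by ioffs(:), which records for every row the thread block that assigned its provisional aggregate. Provisional aggregate numbers become bnds(kk)-1+locnaggr(kk), hence unique across threads; rows that have a neighbour already assigned to an aggregate are skipped (cycle step1); the disjointness test is re-checked inside !$omp critical(update_ilaggr) before an aggregate is committed; and a barrier is added before the prefix sum of bnds. The final pass then maps provisional numbers to a contiguous global numbering using ioffs together with the exclusive prefix sum of locnaggr. Phase two now runs under the previously commented-out !$omp workshare and !$omp parallel do directives, and leftover debug writes are removed.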
--- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 129 +++++++----------- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 129 +++++++----------- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 129 +++++++----------- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 129 +++++++----------- 4 files changed, 200 insertions(+), 316 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 4ec81322..91acfefe 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) 
then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! 
$ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index af0a7764..0c76f269 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! 
naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! 
- disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 967ee669..fe5701ef 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
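+          ! The DISJOINT test is repeated inside the critical section:
+          ! between the first (unlocked) check and entry into the critical
+          ! region another thread may have claimed one of the neighbours,
+          ! so the aggregate is only committed if it is still disjoint.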
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! 
Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 611590cb..54c3add4 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! 
naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! 
- disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! 
Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) From e3de565b6dc31f6cd566547ee8622544d86593a8 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 26 Jul 2023 14:47:05 +0200 Subject: [PATCH 89/96] Updated commeents in SOC1 --- amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 | 14 ++++++++++++++ amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 | 14 ++++++++++++++ amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 | 14 ++++++++++++++ amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 | 14 ++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 91acfefe..eb6b0eac 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 0c76f269..241f0568 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! 
if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index fe5701ef..329cd3ba 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 54c3add4..697a55b3 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. 
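+            ! Skipping the whole row keeps the new aggregate disjoint from
+            ! existing ones: vertex I stays unassigned here and is picked up
+            ! again in the later phases (join the neighbours / leftovers).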
if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 From e78449d0f5c6bed62b23ab5b9172de9b4cb16289 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 31 Jul 2023 13:26:05 +0200 Subject: [PATCH 90/96] Prepare for SOC2 OpenMP --- .../aggregator/{amg_c_soc2_map_bld.f90 => amg_c_soc2_map_bld.F90} | 0 .../aggregator/{amg_d_soc2_map_bld.f90 => amg_d_soc2_map_bld.F90} | 0 .../aggregator/{amg_s_soc2_map_bld.f90 => amg_s_soc2_map_bld.F90} | 0 .../aggregator/{amg_z_soc2_map_bld.f90 => amg_z_soc2_map_bld.F90} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename amgprec/impl/aggregator/{amg_c_soc2_map_bld.f90 => amg_c_soc2_map_bld.F90} (100%) rename amgprec/impl/aggregator/{amg_d_soc2_map_bld.f90 => amg_d_soc2_map_bld.F90} (100%) rename amgprec/impl/aggregator/{amg_s_soc2_map_bld.f90 => amg_s_soc2_map_bld.F90} (100%) rename amgprec/impl/aggregator/{amg_z_soc2_map_bld.f90 => amg_z_soc2_map_bld.F90} (100%) diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 From c1ecb4ebec85413cff2ff728a0c9e611eb77554f Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 3 Aug 2023 13:26:24 +0200 Subject: [PATCH 91/96] Fixed SOC1 and begin work on SOC2 --- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_c_soc2_map_bld.F90 | 84 ++++++++++++++++--- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_d_soc2_map_bld.F90 | 84 ++++++++++++++++--- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_s_soc2_map_bld.F90 | 84 ++++++++++++++++--- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_z_soc2_map_bld.F90 | 84 ++++++++++++++++--- 8 files changed, 448 insertions(+), 140 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index eb6b0eac..516daf4b 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index 020cae4b..ed4161a5 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_c_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 241f0568..f2cf9027 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 1433a670..6047f375 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_d_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 329cd3ba..4d9ab106 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index 4bb17a80..e94261a8 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_s_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 697a55b3..40a85dae 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index c1b165b1..e09bcf1e 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_z_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) From 9e82d2e3118c41b174dfee14ce9002fa7934b7a8 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 4 Aug 2023 09:30:32 +0200 Subject: [PATCH 92/96] Final OMP version of SOC1. 
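
Each thread KK counts the aggregates it creates in LOCNAGGR(KK) and writes a
provisional index into ILAGGR, ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS+KK, whose
remainder mod NTHS is the owning thread and whose quotient is the thread's
block start plus its local aggregate counter; the test
ITMP < BNDS(KK)-1+LOCNAGGR(KK) guards against overflow of the multiplication
by NTHS. After the loop an exclusive prefix sum over LOCNAGGR turns the
provisional indices into the global numbering, decoded as
KP = MOD(ILAGGR(II),NTHS) and ILAGGR(II) = ILAGGR(II)/NTHS - (BNDS(KP)-1) + LOCNAGGR(KP).
The fragment below is an editorial sketch of that encode/decode round trip,
not part of the patch; the thread count, block bounds and aggregate counts in
it are made-up examples.

    program encode_decode_sketch
      implicit none
      integer, parameter :: nths = 4
      integer :: bnds(0:nths), locnaggr(0:nths+1)
      integer :: kk, kp, itmp, iagg
      bnds = (/ 1, 26, 51, 76, 101 /)        ! 100 rows split over 4 threads
      locnaggr = 0
      ! Thread kk creates its 3rd block-local aggregate: encode the thread id
      ! together with the provisional index in a single integer.
      kk = 2
      locnaggr(kk) = 3
      itmp = (bnds(kk)-1+locnaggr(kk))*nths + kk     ! as in the patch
      ! After the loop: exclusive prefix sum, so that locnaggr(kp) holds the
      ! number of aggregates created by threads 0..kp-1 (here 10+2=12).
      locnaggr(0:nths+1) = (/ 0, 10, 12, 0, 0, 0 /)
      ! Decode: recover the owning thread, then renumber globally.
      kp   = mod(itmp, nths)
      iagg = itmp/nths - (bnds(kp)-1) + locnaggr(kp)
      write(*,*) 'thread', kp, 'global aggregate', iagg   ! prints 2 and 15
    end program encode_decode_sketch
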
--- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 37 ++++++++++++------- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 37 ++++++++++++------- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 37 ++++++++++++------- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 37 ++++++++++++------- 4 files changed, 92 insertions(+), 56 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 516daf4b..70931f05 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index f2cf9027..bd40a2a4 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 4d9ab106..109abc00 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 40a85dae..3efee9e8 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write From 73495edf09f6d3f73749e1e6e15d03f7fafd3ee9 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 7 Aug 2023 08:59:32 +0200 Subject: [PATCH 93/96] Finish SOC1 OpenMP --- amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 | 7 ++++++- amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 | 7 ++++++- amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 | 7 ++++++- amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 | 7 ++++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 70931f05..b9110aae 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index bd40a2a4..2b01f3e5 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 109abc00..069c924e 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. 
ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 3efee9e8..d618fe1c 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do From 5bcd36f39411dcd3201072cf48d7ebe2eb3bbf88 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 8 Aug 2023 09:25:15 +0200 Subject: [PATCH 94/96] Fixed SOC1 and SOC2 OpenMP --- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_c_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_d_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_s_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_z_soc2_map_bld.F90 | 189 ++++++++++++++---- 8 files changed, 608 insertions(+), 156 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index b9110aae..81047953 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index ed4161a5..3bda8e90 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_c_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 2b01f3e5..c83dfe3b 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 6047f375..b4602378 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_d_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 069c924e..59a7c03b 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index e94261a8..8dac2dd5 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_s_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index d618fe1c..66c8e4e2 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index e09bcf1e..19956309 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_z_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 From d33bcfe107508a5d71de3eb56f9b7d8c66fdcb0f Mon Sep 17 00:00:00 2001 From: sfilippone Date: Sun, 13 Aug 2023 09:52:25 +0200 Subject: [PATCH 95/96] Completed SOC2 OpenMP. 
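
Phase two (join the neighbours) and phase three (sweep over leftovers) of the
SOC2 map build now run under OpenMP, with the temporary copy of ILAGGR taken
inside an OMP WORKSHARE and psb_tic/psb_toc pairs delimiting phases 1-3. A
GOTO out of a parallel DO is not allowed, so the serial "goto 9999" taken on
non-singleton leftovers is replaced by an atomic write to INFO followed by
CYCLE, and the error branch is taken once, after the loop. Below is a minimal
editorial sketch of that error-handling pattern, not part of the patch; the
subroutine and the failure condition are illustrative, only the control flow
mirrors the change.

    subroutine leftover_sweep_sketch(nr, ilaggr, info)
      implicit none
      integer, intent(in)    :: nr
      integer, intent(inout) :: ilaggr(nr)
      integer, intent(out)   :: info
      integer :: i
      info = 0
      !$omp parallel do schedule(static) shared(ilaggr,info) private(i)
      do i = 1, nr
        if (info /= 0) cycle          ! a previous iteration already failed
        if (ilaggr(i) < 0) then       ! stands in for the "leftover" test
          !$omp atomic write
          info = -1                   ! stands in for psb_err_internal_error_
          !$omp end atomic
          cycle                       ! cannot GOTO out of the parallel loop
        end if
      end do
      !$omp end parallel do
      if (info /= 0) return           ! the serial code would GOTO 9999 here
    end subroutine leftover_sweep_sketch
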
---
 .../impl/aggregator/amg_c_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_c_soc2_map_bld.F90 | 22 ++++++++++++++-----
 .../impl/aggregator/amg_d_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_d_soc2_map_bld.F90 | 22 ++++++++++++++-----
 .../impl/aggregator/amg_s_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_s_soc2_map_bld.F90 | 22 ++++++++++++++-----
 .../impl/aggregator/amg_z_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_z_soc2_map_bld.F90 | 22 ++++++++++++++-----
 8 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
index 81047953..4041ebe5 100644
--- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90
index 3bda8e90..b250e434 100644
--- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90
index c83dfe3b..de95abce 100644
--- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90
index b4602378..345cd1ad 100644
--- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90
index 59a7c03b..0a809624 100644
--- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90
index 8dac2dd5..ef7f5707 100644
--- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90
index 66c8e4e2..2c467426 100644
--- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90
index 19956309..c6ac226e 100644
--- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

From 11421f53a256ad0b33503c5500d24aa5773f5934 Mon Sep 17 00:00:00 2001
From: sfilippone
Date: Tue, 22 Aug 2023 10:36:44 +0200
Subject: [PATCH 96/96] Minor updates on sample output

---
 samples/advanced/pdegen/amg_d_pde2d.F90 | 10 +++++-----
 samples/advanced/pdegen/amg_d_pde3d.F90 |  2 +-
 samples/advanced/pdegen/amg_s_pde2d.F90 | 10 +++++-----
 samples/advanced/pdegen/amg_s_pde3d.F90 |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/samples/advanced/pdegen/amg_d_pde2d.F90 b/samples/advanced/pdegen/amg_d_pde2d.F90
index 37e9fcd6..145c5890 100644
--- a/samples/advanced/pdegen/amg_d_pde2d.F90
+++ b/samples/advanced/pdegen/amg_d_pde2d.F90
@@ -463,14 +463,14 @@ program amg_d_pde2d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
-    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
-    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
-    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
-    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
+    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
+    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
+    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
+    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
     write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
     write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
     write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier
diff --git a/samples/advanced/pdegen/amg_d_pde3d.F90 b/samples/advanced/pdegen/amg_d_pde3d.F90
index cb9542d4..75dcd1a0 100644
--- a/samples/advanced/pdegen/amg_d_pde3d.F90
+++ b/samples/advanced/pdegen/amg_d_pde3d.F90
@@ -467,7 +467,7 @@ program amg_d_pde3d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
diff --git a/samples/advanced/pdegen/amg_s_pde2d.F90 b/samples/advanced/pdegen/amg_s_pde2d.F90
index eb8a8d63..ad28d1f6 100644
--- a/samples/advanced/pdegen/amg_s_pde2d.F90
+++ b/samples/advanced/pdegen/amg_s_pde2d.F90
@@ -463,14 +463,14 @@ program amg_s_pde2d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
-    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
-    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
-    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
-    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
+    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
+    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
+    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
+    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
     write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
     write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
     write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier
diff --git a/samples/advanced/pdegen/amg_s_pde3d.F90 b/samples/advanced/pdegen/amg_s_pde3d.F90
index d6195c45..cda6a48b 100644
--- a/samples/advanced/pdegen/amg_s_pde3d.F90
+++ b/samples/advanced/pdegen/amg_s_pde3d.F90
@@ -467,7 +467,7 @@ program amg_s_pde3d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
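
The leftover sweeps parallelised in the soc2 patches above all rely on the same OpenMP error-handling idiom: a goto cannot jump out of a parallel do, so the error code is stored atomically into a shared flag, the failing iteration is skipped with cycle, and the flag is tested once after the loop. The following is a minimal, self-contained sketch of that idiom only; the program and its data are invented for illustration and are not part of the library.

! Sketch of the shared error-flag pattern used in the parallel leftover loops.
program leftover_flag_sketch
  implicit none
  integer, parameter :: nr = 1000
  integer :: ilaggr(nr)
  integer :: i, info

  ilaggr = 1          ! pretend every vertex got an aggregate...
  ilaggr(17) = 0      ! ...except one leftover
  info = 0

  !$omp parallel do schedule(static) shared(ilaggr,info) private(i)
  do i = 1, nr
    if (ilaggr(i) <= 0) then
      ! No goto out of a parallel do: record the error atomically,
      ! skip this iteration, and test info once the loop has finished.
      !$omp atomic write
      info = -1
      !$omp end atomic
      cycle
    end if
  end do
  !$omp end parallel do

  if (info /= 0) then
    print *, 'leftover detected, info =', info
  else
    print *, 'no leftovers'
  end if
end program leftover_flag_sketch

Built with OpenMP enabled (e.g. gfortran -fopenmp) the loop runs across threads; without it the directives are treated as comments and the serial semantics are unchanged.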