From 5efee2004175bad45761608e74dd05d55bc2f5ad Mon Sep 17 00:00:00 2001
From: StefanoPetrilli <stefanop_1999@hotmail.it>
Date: Sat, 23 Jul 2022 05:52:27 -0500
Subject: [PATCH] Optimization: replaced all unnecessary atomics with reductions

---
 amgprec/impl/aggregator/MatchBoxPC.h          |  1 -
 ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp |  1 -
 amgprec/impl/aggregator/initialize.cpp        |  6 +-
 .../impl/aggregator/processExposedVertex.cpp  | 21 +++--
 .../aggregator/processMatchedVertices.cpp     | 58 +++++++-------
 .../processMatchedVerticesAndSendMessages.cpp | 78 +++++++++----------
 amgprec/impl/aggregator/processMessages.cpp   |  8 +-
 7 files changed, 81 insertions(+), 92 deletions(-)
diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h
index 8bba9540..d4b8c04c 100644
--- a/amgprec/impl/aggregator/MatchBoxPC.h
+++ b/amgprec/impl/aggregator/MatchBoxPC.h
@@ -358,7 +358,6 @@ extern "C"
         bool sendMessages,
         MPI_Comm comm,
         MilanLongInt *msgActual,
-        MilanLongInt *msgInd,
         vector<MilanLongInt> &Message);
 
     void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
index d8e8bfb7..7b47c7c9 100644
--- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
+++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
@@ -402,7 +402,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
                                true,
                                comm,
                                &msgActual,
-                               &msgInd,
                                Message);
 
         ///////////////////////// END OF PROCESS MATCHED VERTICES /////////////////////////
diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp
index 3e7ac207..477f5f6d 100644
--- a/amgprec/impl/aggregator/initialize.cpp
+++ b/amgprec/impl/aggregator/initialize.cpp
@@ -57,13 +57,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 #pragma omp task depend(out \
                         : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices)
             {
-#pragma omp taskloop num_tasks(NUM_THREAD)
+#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \
+                                                     : numGhostEdges[:1])
                 for (i = 0; i < NLEdge; i++)
                 { // O(m) - Each edge stored twice
                     insertMe = verLocInd[i];
                     if ((insertMe < StartIndex) || (insertMe > EndIndex))
                     { // Find a ghost
-#pragma omp atomic
                         (*numGhostEdges)++;
 #pragma omp critical
                         {
@@ -76,7 +76,7 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
                             {                                                 // Insert an entry for the ghost:
                                 Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry
                                 Counter.push_back(1);                         // Initialize the counter
-                                (*numGhostVertices)++;                          // Increment the number of ghost vertices
+                                (*numGhostVertices)++;                        // Increment the number of ghost vertices
                             }                                                 // End of else()
                         }
                     } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp
index c53f2f53..91035372 100644
--- a/amgprec/impl/aggregator/processExposedVertex.cpp
+++ b/amgprec/impl/aggregator/processExposedVertex.cpp
@@ -34,9 +34,13 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
     MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0;
     MilanInt ghostOwner = 0, option;
 
-#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD)
+#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner)                                                          \
+    firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \
+        num_threads(NUM_THREAD)
+
     {
-#pragma omp for reduction(+ : PCounter[:numProcs]) schedule(static)
+#pragma omp for reduction(+ \
+                          : PCounter[:numProcs], myCard[:1], msgInd[:1], NumMessagesBundled[:1]) schedule(static)
         for (v = 0; v < NLVer; v++)
         {
             option = -1;
@@ -76,7 +80,6 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
 
                     if (w >= 0)
                     {
-#pragma omp atomic
                         (*myCard)++;
                         if ((w < StartIndex) || (w > EndIndex))
                         { // w is a ghost vertex
@@ -129,13 +132,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
                         cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
                         fflush(stdout);
 #endif
-#pragma omp atomic
                         (*msgInd)++;
-#pragma omp atomic
                         (*NumMessagesBundled)++;
                         ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                        assert(ghostOwner != -1);
-                        assert(ghostOwner != myRank);
+                        // assert(ghostOwner != -1);
+                        // assert(ghostOwner != myRank);
                         PCounter[ghostOwner]++;
 
                         privateQLocalVtx.push_back(v + StartIndex);
@@ -169,13 +170,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
                 cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
                 fflush(stdout);
 #endif
-#pragma omp atomic
                 (*msgInd)++;
-#pragma omp atomic
                 (*NumMessagesBundled)++;
                 ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                assert(ghostOwner != -1);
-                assert(ghostOwner != myRank);
+                // assert(ghostOwner != -1);
+                // assert(ghostOwner != myRank);
                 PCounter[ghostOwner]++;
 
                 privateQLocalVtx.push_back(v + StartIndex);
diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp
index edb1f788..1e7b2641 100644
--- a/amgprec/impl/aggregator/processMatchedVertices.cpp
+++ b/amgprec/impl/aggregator/processMatchedVertices.cpp
@@ -7,9 +7,9 @@ void processMatchedVertices(
     staticQueue &privateU,
     MilanLongInt StartIndex,
     MilanLongInt EndIndex,
-    MilanLongInt *myCardPtr,
-    MilanLongInt *msgIndPtr,
-    MilanLongInt *NumMessagesBundledPtr,
+    MilanLongInt *myCard,
+    MilanLongInt *msgInd,
+    MilanLongInt *NumMessagesBundled,
     MilanLongInt *SPtr,
     MilanLongInt *verLocPtr,
     MilanLongInt *verLocInd,
@@ -46,7 +46,14 @@ void processMatchedVertices(
 #ifdef COUNT_LOCAL_VERTEX
     MilanLongInt localVertices = 0;
 #endif
-#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD)
+#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option)                                                                    \
+    firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
+        num_threads(NUM_THREAD)                                                                                                                            \
+            reduction(+                                                                                                                                    \
+                      : msgInd[:1], PCounter                                                                                                               \
+                      [:numProcs], myCard                                                                                                                  \
+                      [:1], NumMessagesBundled                                                                                                             \
+                      [:1])
     {
 
         while (!U.empty())
@@ -171,8 +178,8 @@ void processMatchedVertices(
                             // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
                             privateU.push_back(v);
                             privateU.push_back(w);
-#pragma omp atomic
-                            (*myCardPtr)++;
+
+                            (*myCard)++;
 #ifdef PRINT_DEBUG_INFO_
                             cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
                             fflush(stdout);
@@ -183,14 +190,11 @@ void processMatchedVertices(
 
                             // Found a dominating edge, it is a ghost
                             ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                            assert(ghostOwner != -1);
-                            assert(ghostOwner != myRank);
-#pragma omp atomic
+                            // assert(ghostOwner != -1);
+                            // assert(ghostOwner != myRank);
                             PCounter[ghostOwner]++;
-#pragma omp atomic
-                            (*NumMessagesBundledPtr)++;
-#pragma omp atomic
-                            (*msgIndPtr)++;
+                            (*NumMessagesBundled)++;
+                            (*msgInd)++;
 
                             privateQLocalVtx.push_back(v);
                             privateQGhostVtx.push_back(w);
@@ -200,8 +204,8 @@ void processMatchedVertices(
                         case 3:
                             privateU.push_back(v);
                             privateU.push_back(w);
-#pragma omp atomic
-                            (*myCardPtr)++;
+
+                            (*myCard)++;
                             break;
                         case 4:
                             // Could not find a dominating vertex
@@ -220,14 +224,12 @@ void processMatchedVertices(
 #endif
 
                                     ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                                    assert(ghostOwner != -1);
-                                    assert(ghostOwner != myRank);
-#pragma omp atomic
+                                    // assert(ghostOwner != -1);
+                                    // assert(ghostOwner != myRank);
+
                                     PCounter[ghostOwner]++;
-#pragma omp atomic
-                                    (*NumMessagesBundledPtr)++;
-#pragma omp atomic
-                                    (*msgIndPtr)++;
+                                    (*NumMessagesBundled)++;
+                                    (*msgInd)++;
 
                                     privateQLocalVtx.push_back(v);
                                     privateQGhostVtx.push_back(w);
@@ -247,16 +249,12 @@ void processMatchedVertices(
 #endif
 
                             ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
-                            assert(ghostOwner != -1);
-                            assert(ghostOwner != myRank);
+                            // assert(ghostOwner != -1);
+                            // assert(ghostOwner != myRank);
 
-#pragma omp atomic
-                            (*NumMessagesBundledPtr)++;
-#pragma omp atomic
+                            (*NumMessagesBundled)++;
                             PCounter[ghostOwner]++;
-
-#pragma omp atomic
-                            (*msgIndPtr)++;
+                            (*msgInd)++;
 
                             privateQLocalVtx.push_back(u);
                             privateQGhostVtx.push_back(v);
diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp
index e61d561f..7775d193 100644
--- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp
+++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp
@@ -7,9 +7,9 @@ void processMatchedVerticesAndSendMessages(
     staticQueue &privateU,
     MilanLongInt StartIndex,
     MilanLongInt EndIndex,
-    MilanLongInt *myCardPtr,
-    MilanLongInt *msgIndPtr,
-    MilanLongInt *NumMessagesBundledPtr,
+    MilanLongInt *myCard,
+    MilanLongInt *msgInd,
+    MilanLongInt *NumMessagesBundled,
     MilanLongInt *SPtr,
     MilanLongInt *verLocPtr,
     MilanLongInt *verLocInd,
@@ -34,7 +34,6 @@ void processMatchedVerticesAndSendMessages(
     bool sendMessages,
     MPI_Comm comm,
     MilanLongInt *msgActual,
-    MilanLongInt *msgInd,
     vector<MilanLongInt> &Message)
 {
 
@@ -53,7 +52,16 @@ void processMatchedVerticesAndSendMessages(
 #ifdef COUNT_LOCAL_VERTEX
     MilanLongInt localVertices = 0;
 #endif
-#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD)
+#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
+    firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \
+    default(shared) \
+    num_threads(NUM_THREAD) \
+    reduction(+ \
+              : msgInd[:1], \
+                PCounter[:numProcs], \
+                myCard[:1], \
+                NumMessagesBundled[:1], \
+                msgActual[:1])
     {
 
         while (!U.empty())
@@ -178,8 +186,7 @@ void processMatchedVerticesAndSendMessages(
                             // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
                             privateU.push_back(v);
                             privateU.push_back(w);
-#pragma omp atomic
-                            (*myCardPtr)++;
+                            (*myCard)++;
 #ifdef PRINT_DEBUG_INFO_
                             cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
                             fflush(stdout);
@@ -190,8 +197,8 @@ void processMatchedVerticesAndSendMessages(
 
                             // Found a dominating edge, it is a ghost
                             ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                            assert(ghostOwner != -1);
-                            assert(ghostOwner != myRank);
+                            // assert(ghostOwner != -1);
+                            // assert(ghostOwner != myRank);
                             if (sendMessages)
                             {
                                 // Build the Message Packet:
@@ -200,8 +207,8 @@ void processMatchedVerticesAndSendMessages(
                                 Message[2] = REQUEST; // TYPE
                                                       // Send a Request (Asynchronous)
 
-                                //printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]);
-                                //fflush(stdout);
+                                // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]);
+                                // fflush(stdout);
 #pragma omp critical(sendMessage)
                                 {
                                     messagesToSend.push_back(v);
@@ -211,19 +218,15 @@ void processMatchedVerticesAndSendMessages(
                                 }
                                 // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
 
-#pragma omp atomic
                                 (*msgActual)++;
                             }
                             else
                             {
-#pragma omp atomic
                                 PCounter[ghostOwner]++;
-#pragma omp atomic
-                                (*NumMessagesBundledPtr)++;
+                                (*NumMessagesBundled)++;
                             }
 
-#pragma omp atomic
-                            (*msgIndPtr)++;
+                            (*msgInd)++;
 
                             privateQLocalVtx.push_back(v);
                             privateQGhostVtx.push_back(w);
@@ -233,8 +236,7 @@ void processMatchedVerticesAndSendMessages(
                         case 3:
                             privateU.push_back(v);
                             privateU.push_back(w);
-#pragma omp atomic
-                            (*myCardPtr)++;
+                            (*myCard)++;
                             break;
                         case 4:
                             // Could not find a dominating vertex
@@ -253,8 +255,8 @@ void processMatchedVerticesAndSendMessages(
 #endif
 
                                     ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                                    assert(ghostOwner != -1);
-                                    assert(ghostOwner != myRank);
+                                    // assert(ghostOwner != -1);
+                                    // assert(ghostOwner != myRank);
                                     if (sendMessages)
                                     {
                                         // Build the Message Packet:
@@ -263,8 +265,8 @@ void processMatchedVerticesAndSendMessages(
                                         Message[2] = FAILURE; // TYPE
                                                               // Send a Request (Asynchronous)
 
-                                        //printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]);
-                                        //fflush(stdout);
+                                        // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]);
+                                        // fflush(stdout);
 #pragma omp critical(sendMessage)
                                         {
                                             messagesToSend.push_back(v);
@@ -273,19 +275,15 @@ void processMatchedVerticesAndSendMessages(
                                             messagesToSend.push_back(ghostOwner);
                                         }
                                         // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
-#pragma omp atomic
                                         (*msgActual)++;
                                     }
                                     else
                                     {
-#pragma omp atomic
                                         PCounter[ghostOwner]++;
-#pragma omp atomic
-                                        (*NumMessagesBundledPtr)++;
+                                        (*NumMessagesBundled)++;
                                     }
 
-#pragma omp atomic
-                                    (*msgIndPtr)++;
+                                    (*msgInd)++;
 
                                     privateQLocalVtx.push_back(v);
                                     privateQGhostVtx.push_back(w);
@@ -305,8 +303,8 @@ void processMatchedVerticesAndSendMessages(
 #endif
 
                             ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
-                            assert(ghostOwner != -1);
-                            assert(ghostOwner != myRank);
+                            // assert(ghostOwner != -1);
+                            // assert(ghostOwner != myRank);
                             if (sendMessages)
                             {
                                 // Build the Message Packet:
@@ -315,8 +313,8 @@ void processMatchedVerticesAndSendMessages(
                                 Message[2] = SUCCESS; // TYPE
 
                                 // Send a Request (Asynchronous)
-                                //printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]);
-                                //fflush(stdout);
+                                // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]);
+                                // fflush(stdout);
 #pragma omp critical(sendMessage)
                                 {
                                     messagesToSend.push_back(u);
@@ -325,19 +323,15 @@ void processMatchedVerticesAndSendMessages(
                                     messagesToSend.push_back(ghostOwner);
                                 }
                                 // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
-#pragma omp atomic
                                 (*msgActual)++;
                             }
                             else
                             {
-#pragma omp atomic
-                                (*NumMessagesBundledPtr)++;
-#pragma omp atomic
+                                (*NumMessagesBundled)++;
                                 PCounter[ghostOwner]++;
                             }
 
-#pragma omp atomic
-                            (*msgIndPtr)++;
+                            (*msgInd)++;
 
                             privateQLocalVtx.push_back(u);
                             privateQGhostVtx.push_back(v);
@@ -371,10 +365,10 @@ void processMatchedVerticesAndSendMessages(
 
     for (int i = 0; i < messagesToSend.size(); i += 4)
     {
-        Message[0] = messagesToSend[i];       
-        Message[1] = messagesToSend[i + 1];       
+        Message[0] = messagesToSend[i];
+        Message[1] = messagesToSend[i + 1];
         Message[2] = messagesToSend[i + 2];
-        ghostOwner = messagesToSend[i + 3]; 
+        ghostOwner = messagesToSend[i + 3];
         MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
     }
 }
diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp
index c6cb2531..4150a330 100644
--- a/amgprec/impl/aggregator/processMessages.cpp
+++ b/amgprec/impl/aggregator/processMessages.cpp
@@ -243,8 +243,8 @@ void processMessages(
                                 fflush(stdout);
 #endif
                                 ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                                assert(ghostOwner != -1);
-                                assert(ghostOwner != myRank);
+                                //assert(ghostOwner != -1);
+                                //assert(ghostOwner != myRank);
 
                                 MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
                                 (*msgInd)++;
@@ -301,8 +301,8 @@ void processMessages(
                                     fflush(stdout);
 #endif
                                     ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                                    assert(ghostOwner != -1);
-                                    assert(ghostOwner != myRank);
+                                    //assert(ghostOwner != -1);
+                                    //assert(ghostOwner != myRank);
                                     MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
                                     (*msgInd)++;
                                     (*msgActual)++;