From 5efee2004175bad45761608e74dd05d55bc2f5ad Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 05:52:27 -0500 Subject: [PATCH] Optimization, replaced all useless atomic with reduction --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 - amgprec/impl/aggregator/initialize.cpp | 6 +- .../impl/aggregator/processExposedVertex.cpp | 21 +++-- .../aggregator/processMatchedVertices.cpp | 58 +++++++------- .../processMatchedVerticesAndSendMessages.cpp | 78 +++++++++---------- amgprec/impl/aggregator/processMessages.cpp | 8 +- 7 files changed, 81 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 8bba9540..d4b8c04c 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -358,7 +358,6 @@ extern "C" bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, - MilanLongInt *msgInd, vector &Message); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d8e8bfb7..7b47c7c9 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -402,7 +402,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( true, comm, &msgActual, - &msgInd, Message); ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 3e7ac207..477f5f6d 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -57,13 +57,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices) { -#pragma omp taskloop num_tasks(NUM_THREAD) +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges[:1]) for (i = 0; i < NLEdge; i++) { // O(m) - Each edge stored twice insertMe = verLocInd[i]; if ((insertMe < StartIndex) || (insertMe > EndIndex)) { // Find a ghost -#pragma omp atomic (*numGhostEdges)++; #pragma omp critical { @@ -76,7 +76,7 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, { // Insert an entry for the ghost: Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry Counter.push_back(1); // Initialize the counter - (*numGhostVertices)++; // Increment the number of ghost vertices + (*numGhostVertices)++; // Increment the number of ghost vertices } // End of else() } } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c53f2f53..91035372 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -34,9 +34,13 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; MilanInt ghostOwner = 0, option; -#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \ + num_threads(NUM_THREAD) + { -#pragma omp for reduction(+ : PCounter[:numProcs]) schedule(static) +#pragma omp for reduction(+ \ + : PCounter[:numProcs], myCard[:1], msgInd[:1], NumMessagesBundled[:1]) schedule(static) for (v = 0; v < NLVer; v++) { option = -1; @@ -76,7 +80,6 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if (w >= 0) { -#pragma omp atomic (*myCard)++; if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex @@ -129,13 +132,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif -#pragma omp atomic (*msgInd)++; -#pragma omp atomic (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; privateQLocalVtx.push_back(v + StartIndex); @@ -169,13 +170,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif -#pragma omp atomic (*msgInd)++; -#pragma omp atomic (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; privateQLocalVtx.push_back(v + StartIndex); diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index edb1f788..1e7b2641 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -7,9 +7,9 @@ void processMatchedVertices( staticQueue &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, MilanLongInt *SPtr, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, @@ -46,7 +46,14 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1]) { while (!U.empty()) @@ -171,8 +178,8 @@ void processMatchedVertices( // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -183,14 +190,11 @@ void processMatchedVertices( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundled)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -200,8 +204,8 @@ void processMatchedVertices( case 3: privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + + (*myCard)++; break; case 4: // Could not find a dominating vertex @@ -220,14 +224,12 @@ void processMatchedVertices( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundled)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -247,16 +249,12 @@ void processMatchedVertices( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic + (*NumMessagesBundled)++; PCounter[ghostOwner]++; - -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index e61d561f..7775d193 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -7,9 +7,9 @@ void processMatchedVerticesAndSendMessages( staticQueue &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, MilanLongInt *SPtr, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, @@ -34,7 +34,6 @@ void processMatchedVerticesAndSendMessages( bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, - MilanLongInt *msgInd, vector &Message) { @@ -53,7 +52,16 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \ +default(shared) \ +num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ + [:1]) { while (!U.empty()) @@ -178,8 +186,7 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -190,8 +197,8 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -200,8 +207,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) - //printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(v); @@ -211,19 +218,15 @@ void processMatchedVerticesAndSendMessages( } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundled)++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -233,8 +236,7 @@ void processMatchedVerticesAndSendMessages( case 3: privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + (*myCard)++; break; case 4: // Could not find a dominating vertex @@ -253,8 +255,8 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -263,8 +265,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = FAILURE; // TYPE // Send a Request (Asynchronous) - //printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(v); @@ -273,19 +275,15 @@ void processMatchedVerticesAndSendMessages( messagesToSend.push_back(ghostOwner); } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundled)++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -305,8 +303,8 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -315,8 +313,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - //printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(u); @@ -325,19 +323,15 @@ void processMatchedVerticesAndSendMessages( messagesToSend.push_back(ghostOwner); } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic + (*NumMessagesBundled)++; PCounter[ghostOwner]++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); @@ -371,10 +365,10 @@ void processMatchedVerticesAndSendMessages( for (int i = 0; i < messagesToSend.size(); i += 4) { - Message[0] = messagesToSend[i]; - Message[1] = messagesToSend[i + 1]; + Message[0] = messagesToSend[i]; + Message[1] = messagesToSend[i + 1]; Message[2] = messagesToSend[i + 2]; - ghostOwner = messagesToSend[i + 3]; + ghostOwner = messagesToSend[i + 3]; MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); } } diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c6cb2531..4150a330 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -243,8 +243,8 @@ void processMessages( fflush(stdout); #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgInd)++; @@ -301,8 +301,8 @@ void processMessages( fflush(stdout); #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgInd)++; (*msgActual)++;