From b079d71f30fc968a323169be8941b018eb897ddc Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 2 Jun 2022 07:29:21 -0500 Subject: [PATCH] Further optimizations PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 284 +++++++++--------- 1 file changed, 143 insertions(+), 141 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 783ed17e..9f3cbb97 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -164,6 +164,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( for (int i = 0; i < numProcs; i++) PCounter[i] = 0; + MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! //vector candidateMate; @@ -213,15 +214,12 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - /* - staticQueue privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner; - */ bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner /*, privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner*/) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -402,7 +400,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * Create the Queue Data Structure for the Dominating Set * * I had to declare the staticuQueue U before the parallel region - * to have it in the correct scope. Since we can't chane the dimension + * to have it in the correct scope. Since we can't change the dimension * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ @@ -462,102 +460,103 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * in parallel. */ - MilanLongInt size = numGhostEdges; //TODO how can I decide a meaningfull size? + MilanLongInt size = numGhostVertices; //TODO how can I decide a more meaningfull size? //Fail messages privateQLocalVtx.~staticQueue(); privateQGhostVtx.~staticQueue(); privateQMsgType.~staticQueue(); privateQOwner.~staticQueue(); - //Request messages - /* - privateReqQLocalVtx.~staticQueue(); - privateReqQGhostVtx.~staticQueue(); - privateReqQMsgType.~staticQueue(); - privateReqQOwner.~staticQueue(); - */ + privateU.~staticQueue(); + + new(&privateU) staticQueue(NLVer + numGhostVertices); //TODO how can I put a meaningfull size? new(&privateQLocalVtx) staticQueue(size); new(&privateQGhostVtx) staticQueue(size); new(&privateQMsgType) staticQueue(size); new(&privateQOwner) staticQueue(size); - /* - new(&privateReqQLocalVtx) staticQueue(size); - new(&privateReqQGhostVtx) staticQueue(size); - new(&privateReqQMsgType) staticQueue(size); - new(&privateReqQOwner) staticQueue(size); - */ #pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) - for ( v=0; v < NLVer; v++ ) - { - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - k = candidateMate[v]; - candidateMate[v] = verLocInd[k]; - w = candidateMate[v]; + for (v = 0; v < NLVer; v++) { + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - //If found a dominating edge: - if (w >= 0) { - - //This piece of code is actually executed under 0.01% of the times - - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } - if (w >= 0) { + if (w >= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) +#pragma omp critical + { + if (Counter[Ghost2LocalMap[w]] > 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + Counter[Ghost2LocalMap[w]] -= 1; //Decrement if (Counter[Ghost2LocalMap[w]] == 0) { S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ @@ -565,79 +564,86 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } // end of critical region - } //End of if a Ghost Vertex - else { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical(Mate) - { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - - //if (w < 0) { -- if it arrives here this one if is useless, it is certainly -1 - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + + + } //End of if a Ghost Vertex + else { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + privateU.push_back(v + StartIndex); + privateU.push_back(w); + + Mate[v] = w; //v is local + //FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; //w is local + + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + + //This piece of code is executed a really small amount of times, I will not allocate a + //huge amount of memory to the private data structures. + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<