diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
index 77494032..4814d32e 100644
--- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
+++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
@@ -84,17 +84,17 @@ template<> inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
 // DOUBLE PRECISION VERSION
 //WARNING: The vertex block on a given rank is contiguous
 void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
-                  MilanLongInt NLVer, MilanLongInt NLEdge,
-                  MilanLongInt* verLocPtr, MilanLongInt* verLocInd,
-                  MilanReal* edgeLocWeight,
-                  MilanLongInt* verDistance,
-                  MilanLongInt* Mate,
-                  MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
-                  MilanLongInt* msgIndSent, MilanLongInt* msgActualSent,
-                  MilanReal* msgPercent,
-                  MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
-                  MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
-
+        MilanLongInt NLVer, MilanLongInt NLEdge,
+        MilanLongInt* verLocPtr, MilanLongInt* verLocInd,
+        MilanReal* edgeLocWeight,
+        MilanLongInt* verDistance,
+        MilanLongInt* Mate,
+        MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
+        MilanLongInt* msgIndSent, MilanLongInt* msgActualSent,
+        MilanReal* msgPercent,
+        MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
+        MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
+
 /*
  * verDistance: it's a vector long as the number of processors.
  * verDistance[i] contains the first node index of the i-th processor
@@ -424,99 +424,73 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
         //Compute the Initial Matching Set:
         S = numGhostVertices; //Initialize S with number of Ghost Vertices
-    } // end of single region
-    /*
-     * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B
-     * The next portion of code has been splitted
-     * to make it 100% parallelized
-     *
-     * TODO: I think it diminish the cache update, does it?
-     *
-     * TODO: would it make any sense to parallelize also the
-     * inner for?
-     *
-     * TODO: we have a false sharing on candidateMate
-     */
-#pragma omp for
-    for ( v=0; v < NLVer; v++ ) {
-#ifdef PRINT_DEBUG_INFO_
-        cout<<"\n("<<myRank<<")Processing: "<<v+StartIndex<<endl; fflush(stdout);
-#endif
-        //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
-        adj1 = verLocPtr[v];
-        adj2 = verLocPtr[v+1];
-        w = -1;
-        heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN
-        for (k = adj1; k < adj2; k++) {
-            if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex?
-                if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched
-                    continue;
-            } else { //A local vertex
-                if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched
-                    continue;
-            }
-
-            if ((edgeLocWeight[k] > heaviestEdgeWt) ||
-                ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
-                heaviestEdgeWt = edgeLocWeight[k];
-                w = verLocInd[k];
-
-            }
-        } //End of for loop
-        candidateMate[v] = w;
-        //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
+        /*
+         * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B
+         * It is actually not possible to parallelize this cycle
+         * as it is.
+         *
+         * TODO: think about how it could be parallelized
+         */
-    }
+        for ( v=0; v < NLVer; v++ ) {
+#ifdef PRINT_DEBUG_INFO_
+            cout<<"\n("<<myRank<<")Processing: "<<v+StartIndex<<endl; fflush(stdout);
+#endif
+            //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
+            adj1 = verLocPtr[v];
+            adj2 = verLocPtr[v+1];
+            w = -1;
+            heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN
+            for (k = adj1; k < adj2; k++) {
+                if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex?
+                    if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched
+                        continue;
+                } else { //A local vertex
+                    if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched
+                        continue;
+                }
-        /*
-         TODO this cycle has a lot of margin of improvement!!!!
-         This current version introduce some errors.
-         1 - rollback to the previous verison and check if it is
-             100% stable
-         2 - if the previous verison was stable all right, if not
-             that's a big deal
-         3 - reimplement step by step to check from where the instability
-             comes from
-         */
+                if ((edgeLocWeight[k] > heaviestEdgeWt) ||
+                    ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
+                    heaviestEdgeWt = edgeLocWeight[k];
+                    w = verLocInd[k];
-#pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs])
-    for ( v=0; v < NLVer; v++ ) {
+                }
+            } //End of for loop
+            candidateMate[v] = w;
+            //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
-        //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
+            //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
 #ifdef PRINT_DEBUG_INFO_
-        cout<<"\n("<<myRank<<")"<<v+StartIndex<<" Points to: "<<w; fflush(stdout);
-#endif
-        //If found a dominating edge:
-        if (w >= 0) {
-            myCard++;
-            if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex
-                //Build the Message Packet:
-                //Message[0] = v+StartIndex; //LOCAL
-                //Message[1] = w; //GHOST
-                //Message[2] = REQUEST; //TYPE
-                //Send a Request (Asynchronous)
+            //If found a dominating edge:
+            if (w >= 0) {
+                myCard++;
+                if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex
+                    //Build the Message Packet:
+                    //Message[0] = v+StartIndex; //LOCAL
+                    //Message[1] = w; //GHOST
+                    //Message[2] = REQUEST; //TYPE
+                    //Send a Request (Asynchronous)
 #ifdef PRINT_DEBUG_INFO_
-                cout<<"\n("<<myRank<<")Sending a request message:";
-                cout<<"\n("<<myRank<<")Local is: "<<v+StartIndex<<" Ghost is "<<w<<" Owner is: "<<findOwnerOfGhost(w, verDistance, myRank, numProcs)<<endl; fflush(stdout);
-#endif
-#pragma omp critical
-            {
-                if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
-                    msgInd++;
-                    NumMessagesBundled++;
-                    ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
-                    PCounter[ghostOwner]++;
-                    QLocalVtx.push_back(v + StartIndex);
-                    QGhostVtx.push_back(w);
-                    QMsgType.push_back(REQUEST);
-                    QOwner.push_back(ghostOwner);
-                    //Decrement the counter:
-                    //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v)
-                    if (Counter[Ghost2LocalMap[w]] > 0) {
-                    Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement
-                    if (Counter[Ghost2LocalMap[w]] == 0) {
-                        S--; //Decrement S
+                    if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
+                        msgInd++;
+                        NumMessagesBundled++;
+                        ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
+                        PCounter[ghostOwner]++;
+                        QLocalVtx.push_back(v + StartIndex);
+                        QGhostVtx.push_back(w);
+                        QMsgType.push_back(REQUEST);
+                        QOwner.push_back(ghostOwner);
+                        //Decrement the counter:
+                        //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v)
+                        if (Counter[Ghost2LocalMap[w]] > 0) {
+                            Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement
+                            if (Counter[Ghost2LocalMap[w]] == 0) {
+                                S--; //Decrement S
 #ifdef PRINT_DEBUG_INFO_
-                        cout<<"\n("<<myRank<<")Decrementing S: Ghost vertex "<<w<<" has received all its messages"; fflush(stdout);
-#endif
-                    }
-                    } //End of if Counter[w] > 0
-                    //End: PARALLEL_PROCESS_CROSS_EDGE_B(v)
-                } //End of if CandidateMate[w] = v
-            } // end of critical region
-        } //End of if a Ghost Vertex
-        else { // w is a local vertex
+                            }
+                        } //End of if Counter[w] > 0
+                        //End: PARALLEL_PROCESS_CROSS_EDGE_B(v)
+                    } //End of if CandidateMate[w] = v
+                } //End of if a Ghost Vertex
+                else { // w is a local vertex
             if (candidateMate[w - StartIndex] == (v + StartIndex)) {
-#pragma omp critical
-            {
-                Mate[v] = w; //v is local
-                Mate[w - StartIndex] = v + StartIndex; //w is local
-                //Q.push_back(u);
-                U.push_back(v + StartIndex);
-                U.push_back(w);
+
+                    Mate[v] = w; //v is local
+                    Mate[w - StartIndex] = v + StartIndex; //w is local
+                    //Q.push_back(u);
+                    U.push_back(v + StartIndex);
+                    U.push_back(w);
 #ifdef PRINT_DEBUG_INFO_
-                cout<<"\n("<<myRank<<")MATCH: ("<<v+StartIndex<<","<<w<<") "; fflush(stdout);
-#endif
-            } // end of critical region
-            } //End of if candidateMate[w-StartIndex] == (v+StartIndex)
-        } //End of Else
-        } //End of if(w >=0)
-        else {
-        adj11 = verLocPtr[v];
-        adj12 = verLocPtr[v + 1];
-        for (k1 = adj11; k1 < adj12; k1++) {
-            w = verLocInd[k1];
-            if ((w < StartIndex) || (w > EndIndex)) { //A ghost
-                //Build the Message Packet:
-                //Message[0] = v+StartIndex; //LOCAL
-                //Message[1] = w; //GHOST
-                //Message[2] = FAILURE; //TYPE
-                //Send a Request (Asynchronous)
+                else {
+                    adj11 = verLocPtr[v];
+                    adj12 = verLocPtr[v + 1];
+                    for (k1 = adj11; k1 < adj12; k1++) {
+                        w = verLocInd[k1];
+                        if ((w < StartIndex) || (w > EndIndex)) { //A ghost
+                            //Build the Message Packet:
+                            //Message[0] = v+StartIndex; //LOCAL
+                            //Message[1] = w; //GHOST
+                            //Message[2] = FAILURE; //TYPE
+                            //Send a Request (Asynchronous)
 #ifdef PRINT_DEBUG_INFO_
-                cout<<"\n("<<myRank<<")Sending a failure message: ";
-                cout<<"\n("<<myRank<<")Local is: "<<v+StartIndex<<" Ghost is "<<w<<" Owner is: "<<findOwnerOfGhost(w, verDistance, myRank, numProcs)<<endl; fflush(stdout);
-#endif
@@ ... @@
         if ((edgeLocWeight[k1] > heaviestEdgeWt) ||
-             ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) {
+            ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) {
             heaviestEdgeWt = edgeLocWeight[k1];
             w = verLocInd[k1];
         }
@@ -962,7 +934,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
     }
     while ( true ) {
 #ifdef DEBUG_HANG_
-    if (myRank == 0) cout<<"\n("<<myRank<<") Main loop"<<endl; fflush(stdout);
+        if (myRank == 0) cout<<"\n("<<myRank<<") Main loop"<<endl; fflush(stdout);
 #endif
@@ ... @@
         if ((edgeLocWeight[k1] > heaviestEdgeWt) ||
-             ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) {
+            ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) {
             heaviestEdgeWt = edgeLocWeight[k1];
             w = verLocInd[k1];
         }
@@ -1112,7 +1084,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
             MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
             msgInd++; msgActual++;
 #ifdef DEBUG_GHOST_
-    if ((u<StartIndex) || (u>EndIndex)) {
+            if ((u<StartIndex) || (u>EndIndex)) {
                 cout<<"\n("<<myRank<<") Bad address "<<u<<endl; fflush(stdout);
             }
 #endif
@@ ... @@
 #ifdef DEBUG_GHOST_
-    if ((v<StartIndex) || (v>EndIndex)) {
+            if ((v<StartIndex) || (v>EndIndex)) {
                 cout<<"\n("<<myRank<<") Bad address "<<v<<endl; fflush(stdout);
             }
 #endif
@@ ... @@
 #ifdef DEBUG_GHOST_
-    if ((v<0)||(v<StartIndex)||((v-StartIndex)>NLVer)) {
+            if ((v<0)||(v<StartIndex)||((v-StartIndex)>NLVer)) {
                 cout<<"\n("<<myRank<<") Bad address "<<v<<" "<<v-StartIndex<<endl; fflush(stdout);
             }
 #endif
             } //End of if Counter[w] > 0
             //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u)
 #ifdef DEBUG_GHOST_
-    if ((v<0)||(v<StartIndex)||((v-StartIndex)>NLVer)) {
+            if ((v<0)||(v<StartIndex)||((v-StartIndex)>NLVer)) {
                 cout<<"\n("<<myRank<<") Bad address "<<v<<" "<<v-StartIndex<<endl; fflush(stdout);
             }
 #endif
@@ -1381,7 +1353,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
         if ((edgeLocWeight[k1] > heaviestEdgeWt) ||
-             ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) {
+            ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) {
             heaviestEdgeWt = edgeLocWeight[k1];
             w = verLocInd[k1];
         }
@@ -1451,8 +1423,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
     MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]);
     //MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer
     if ( BufferSize > 0 ) {
-            MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer
-            free(Buffer); //Free the memory that was allocated
+        MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer
+        free(Buffer); //Free the memory that was allocated
     }
     finishTime = MPI_Wtime();
     *ph2_time = finishTime-startTime; //Time taken for Phase-2
@@ -1478,9 +1450,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
     *msgActualSent = msgActual;
     *msgIndSent = msgInd;
     if (msgInd > 0) {
-      *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0;
+        *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0;
     } else {
-      *msgPercent = 0;
+        *msgPercent = 0;
    }
 #ifdef DEBUG_HANG_
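A few notes on the patterns this patch touches, with small standalone sketches. Everything below is illustrative: any name, message payload, or helper that does not appear in the diff itself is an assumption, not part of the patch.

The first hunk header shows the file's `TypeMap<T>()` device: a function template specialized per C++ type, so MPI calls such as the `MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ...)` above always pass a datatype that matches the buffer's element type. A minimal sketch of the same idea, assuming `MilanLongInt` maps to a 64-bit integer (that mapping is an assumption of this example):

```cpp
#include <mpi.h>
#include <cstdint>

// Compile-time map from a C++ type to its MPI datatype, mirroring the
// TypeMap<float>() specialization visible in the first hunk header.
template <typename T> MPI_Datatype TypeMap();
template <> inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template <> inline MPI_Datatype TypeMap<int>()     { return MPI_INT; }
template <> inline MPI_Datatype TypeMap<double>()  { return MPI_DOUBLE; }
template <> inline MPI_Datatype TypeMap<float>()   { return MPI_FLOAT; }

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int64_t message[3] = {0, 0, 0};
    MPI_Datatype dt = TypeMap<int64_t>();  // datatype derived from buffer type
    MPI_Request req;
    // Deadlock-free self-send on MPI_COMM_SELF, just to exercise the mapping:
    MPI_Isend(message, 3, dt, 0, 0, MPI_COMM_SELF, &req);
    MPI_Recv(message, 3, dt, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    MPI_Finalize();
    return 0;
}
```

The benefit is that changing `MilanLongInt`'s underlying type cannot silently desynchronize the buffer type and the datatype argument.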
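The core change of the patch replaces the `#pragma omp for` over the combined candidate-mate/exposed-vertex loop with a plain serial loop, and the new comment states the cycle cannot be parallelized as it stands. The obstacle is a loop-carried dependency: iteration `v` may write `Mate[w - StartIndex]` for a *different* local vertex `w`, and later iterations read exactly those entries in their "already matched" checks (and shared state such as `U`, `S`, and `Counter` is updated along the way). A toy illustration of that order sensitivity, using a hypothetical three-vertex instance rather than the diff's data structures:

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Edges: 0-2 (heavy) and 1-2 (light). mate[x] == -1 means unmatched.
    std::vector<long> mate(3, -1);
    for (long v = 0; v < 2; v++) {          // serial order is what makes this deterministic
        // Each vertex picks neighbour 2 only if 2 is still unmatched:
        long w = (mate[2] < 0) ? 2 : -1;
        if (w >= 0 && mate[w] < 0) {
            mate[v] = w;                    // writes shared state that the NEXT
            mate[w] = v;                    // iteration's eligibility check reads
        }
    }
    std::printf("mate: %ld %ld %ld\n", mate[0], mate[1], mate[2]); // 2 -1 0
    return 0;
}
```

Run concurrently without synchronization, both iterations could observe `mate[2] < 0` and claim vertex 2 at once, which is the kind of instability the removed "this current version introduce some errors" comment was describing.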
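One of the removed TODOs worried about false sharing on `candidateMate`: neighbouring 8-byte entries written by different threads sit on the same cache line, so every write invalidates the other threads' copies of that line. If the loop is ever re-parallelized, one standard mitigation is padding each slot to a full cache line; a sketch of that layout (this padded structure is not part of the patch, only an option):

```cpp
#include <vector>

// One 64-byte cache line per slot, so threads writing adjacent entries no
// longer invalidate each other's lines. Costs ~8x the memory of plain longs.
struct alignas(64) PaddedLong {
    long value;
};

int main() {
    std::vector<PaddedLong> candidateMate(1000);  // hypothetical padded variant
    candidateMate[0].value = -1;                  // -1 = no candidate yet
    return 0;
}
```

Whether the padding pays off depends on how often adjacent entries are written by different threads; for a blocked iteration space the sharing may already be rare.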
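The removed `#pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs])` combined ordinary scalar reductions with an OpenMP 4.5 array-section reduction: each thread gets a private copy of the whole `PCounter` range and the copies are summed at the end of the loop. A runnable sketch of just that mechanism, with an invented loop body standing in for the matching logic:

```cpp
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
    const int numProcs = 4;           // per-destination-rank counters
    const int NLVer = 1000;           // local vertex count, made up here
    long msgInd = 0, myCard = 0;
    std::vector<long> PCounter(numProcs, 0);
    long* PC = PCounter.data();       // array sections need a pointer + length

#pragma omp parallel for reduction(+: msgInd, myCard, PC[:numProcs])
    for (int v = 0; v < NLVer; v++) {
        if (v % 3 == 0) {             // stand-in for "candidate is a ghost"
            msgInd++;                 // each thread bumps its private copy;
            PC[v % numProcs]++;       // OpenMP sums all copies at the barrier
        } else {
            myCard++;                 // stand-in for "matched locally"
        }
    }
    std::printf("msgInd=%ld myCard=%ld PCounter[0]=%ld\n", msgInd, myCard, PC[0]);
    return 0;
}
```

This is why the serialized version in the patch can simply drop the pragma: the `++` updates are already plain sequential increments once only one thread runs the loop.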
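Both the old and the new code keep the same dominant-edge rule, repeated at every neighbour scan in the diff: take the heaviest eligible incident edge, and break weight ties toward the larger vertex index, so all ranks resolve a tie identically without communicating. The rule extracted into a standalone helper (names are local to this example):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Returns the chosen neighbour index, or -1 if none is eligible.
long pickCandidate(const std::vector<long>& nbr, const std::vector<double>& wt) {
    long w = -1;
    double heaviestEdgeWt = -1.0e300;          // stand-in for MilanRealMin
    for (std::size_t k = 0; k < nbr.size(); k++) {
        if ((wt[k] > heaviestEdgeWt) ||
            ((wt[k] == heaviestEdgeWt) && (w < nbr[k]))) {  // tie: larger index wins
            heaviestEdgeWt = wt[k];
            w = nbr[k];
        }
    }
    return w;
}

int main() {
    // Neighbours 3 and 7 tie at weight 2.0: the rule always picks 7.
    std::printf("%ld\n", pickCandidate({3, 7, 9}, {2.0, 2.0, 1.5}));
    return 0;
}
```

Determinism of the tie-break matters more than the particular direction chosen: two ranks scanning the two endpoints of the same edge must agree on the winner for the pointer-chasing handshake to terminate.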
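Finally, the Phase-2 cleanup hunk re-indents the `MPI_Buffer_detach` + `free` pair. The lifecycle it belongs to: attach a user-allocated buffer, let `MPI_Bsend` copy outgoing messages into it and return immediately, then detach (which blocks until all buffered sends have drained) before freeing the memory. A minimal sketch with a hypothetical 3-element payload; run with at least 2 ranks:

```cpp
#include <mpi.h>
#include <cstdio>
#include <cstdlib>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Room for one 3-long message plus the mandatory per-send overhead.
    int bufferSize = MPI_BSEND_OVERHEAD + 3 * (int)sizeof(long);
    void* buffer = std::malloc(bufferSize);
    MPI_Buffer_attach(buffer, bufferSize);      // MPI owns this storage now

    long msg[3] = {42, 17, 1};                  // e.g. {local, ghost, REQUEST}
    if (nprocs >= 2) {
        if (rank == 0)
            MPI_Bsend(msg, 3, MPI_LONG, 1, 0, MPI_COMM_WORLD); // returns once copied
        else if (rank == 1)
            MPI_Recv(msg, 3, MPI_LONG, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    // Detach waits for buffered sends to drain; only then is free() safe,
    // which is exactly the detach-then-free ordering the patch preserves.
    MPI_Buffer_detach(&buffer, &bufferSize);
    std::free(buffer);
    MPI_Finalize();
    return 0;
}
```

The `if ( BufferSize > 0 )` guard in the diff serves the same purpose as the `nprocs` check here: skip the detach when nothing was ever attached.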