From 60f90804d53304d270237ff66f476cdec82c91da Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 1 May 2022 04:42:33 -0500 Subject: [PATCH 01/96] Time tracking in MatchBox --- amgprec/impl/aggregator/MatchBoxPC.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index c1ec0976..8eb4af08 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -64,6 +64,13 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif + +#ifdef #IE + + #ifdef TIME_TRACKER + double tmr = MPI_Wtime(); + #endif + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, verLocPtr, verLocInd, edgeLocWeight, verDistance, Mate, @@ -71,6 +78,12 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, msgIndSent, msgActualSent, msgPercent, ph0_time, ph1_time, ph2_time, ph1_card, ph2_card ); + + #ifdef TIME_TRACKER + tmr = MPI_Wtime() - tmr; + fprintf(stderr, "Elaboration time: %f\n", tmr); + #endif + #endif } From 1760afbe97acb1d8e18f56b74b3c7dbe83aca863 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 1 May 2022 04:47:03 -0500 Subject: [PATCH 02/96] Time tracking in algoDistEdge --- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 8be438b6..62e5112f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -92,6 +92,21 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanReal* msgPercent, MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { + + /* + * verDistance: it's a vector long as the number of processors. 
+ * verDistance[i] contains the first node index of the i-th processor + * verDistance[i + 1] contains the last node index of the i-th processor + * NLVer: number of elements in the LocPtr + * NLEdge: number of edges assigned to the current processor + * + * Contains the portion of matrix assigned to the processor in + * Yale notation + * verLocInd: contains the positions on row of the matrix + * verLocPtr: i-th value is the position of the first element on the i-th row and + * i+1-th value is the position of the first element on the i+1-th row + */ + #if !defined(SERIAL_MPI) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< EndIndex) ) } //End of for(ghost vertices) + +#ifdef TIME_TRACKER + Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; + fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); +#endif + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Sun, 1 May 2022 05:42:42 -0500 Subject: [PATCH 03/96] verGhostIndInitialization and Ghost2LocalInitialization cycles parallelization --- amgprec/impl/aggregator/Makefile | 2 +- amgprec/impl/aggregator/MatchBoxPC.cpp | 3 +- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 65 ++++++++++++++----- exec.sh | 7 ++ samples/advanced/pdegen/Makefile | 2 +- 5 files changed, 60 insertions(+), 19 deletions(-) create mode 100755 exec.sh diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index d857a3b0..0444e60d 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -4,7 +4,7 @@ INCDIR=../../../include MODDIR=../../../modules HERE=../.. -FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) +FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) -fopenmp CXXINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(INCDIR) $(FMFLAG)/. 
#CINCLUDES= -I${SUPERLU_INCDIR} -I${HSL_INCDIR} -I${SPRAL_INCDIR} -I/home/users/pasqua/Ambra/BootCMatch/include -lBCM -L/home/users/pasqua/Ambra/BootCMatch/lib -lm diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index 8eb4af08..fc30e8fd 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -60,13 +60,12 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { #if !defined(SERIAL_MPI) MPI_Comm C_comm=MPI_Comm_f2c(icomm); + #ifdef DEBUG fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif -#ifdef #IE - #ifdef TIME_TRACKER double tmr = MPI_Wtime(); #endif diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 62e5112f..da8b3896 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -1,4 +1,6 @@ #include "MatchBoxPC.h" +#include +#include // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -167,25 +169,40 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( if (myRank == 0) cout<<"\n("< EndIndex) ) { //Find a ghost - storedAlready = Ghost2LocalMap.find( insertMe ); - if ( storedAlready != Ghost2LocalMap.end() ) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - numGhostEdges++; - } else { //Insert an entry for the ghost: - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "< 0 ) cout< Date: Sun, 1 May 2022 15:26:47 -0500 Subject: [PATCH 05/96] Single parallel regions with multiple for cycles Added OMP for testing --- amgprec/impl/aggregator/Makefile | 2 +- amgprec/impl/aggregator/MatchBoxPC.cpp | 3 +- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 191 ++++++++++-------- exec.sh | 2 +- 4 files changed, 115 insertions(+), 83 deletions(-) diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 0444e60d..d857a3b0 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -4,7 +4,7 @@ INCDIR=../../../include MODDIR=../../../modules HERE=../.. -FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) -fopenmp +FINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(PSBLAS_INCLUDES) CXXINCLUDES=$(FMFLAG)$(HERE) $(FMFLAG)$(INCDIR) $(FMFLAG)/. 
#CINCLUDES= -I${SUPERLU_INCDIR} -I${HSL_INCDIR} -I${SPRAL_INCDIR} -I/home/users/pasqua/Ambra/BootCMatch/include -lBCM -L/home/users/pasqua/Ambra/BootCMatch/lib -lm diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index fc30e8fd..270c6d04 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -66,6 +66,7 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif +#define TIME_TRACKER #ifdef TIME_TRACKER double tmr = MPI_Wtime(); #endif @@ -80,7 +81,7 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, #ifdef TIME_TRACKER tmr = MPI_Wtime() - tmr; - fprintf(stderr, "Elaboration time: %f\n", tmr); + fprintf(stderr, "Elaboration time: %f for $ld\n", tmr, NLEdge); #endif #endif diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 6f5dd9be..818c9f07 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -124,7 +124,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( //inputSubGraph.getStartEndIndices(StartIndex, EndIndex); MilanLongInt StartIndex = verDistance[myRank]; //The starting vertex owned by the current rank //MilanLongInt EndIndex = verDistance[myRank+1]; //The ending vertex owned by the current rank - MilanLongInt EndIndex = verDistance[myRank+1]-1; //The ending vertex owned by the current rank + MilanLongInt EndIndex = verDistance[myRank + 1] - 1; //The ending vertex owned by the current rank MPI_Status computeStatus; const int ComputeTag = 7; //Predefined tag @@ -135,8 +135,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( int message_length; //MilanLongInt NLVer=0, NLEdge=0, StartIndex=0, EndIndex=0; - MilanLongInt msgActual=0, msgInd=0; - MilanReal heaviestEdgeWt=0.0f; //Assumes positive weight + MilanLongInt msgActual = 0, msgInd = 0; + MilanReal heaviestEdgeWt = 0.0f; //Assumes positive weight MilanReal startTime, finishTime; //MilanReal Precision = MPI_Wtick(); //Get the precision of the MPI Timer startTime = MPI_Wtime(); @@ -150,18 +150,18 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex + map Ghost2LocalMap; //Map each ghost vertex to a local vertex // index that starts with zero to |Vg| - 1 map::iterator storedAlready; - vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe=0; //Number of Ghost vertices + vector Counter; //Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; + //Mate array for ghost vertices: + vector GMate; //Proportional to the number of ghost vertices + #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel for private(insertMe) firstprivate(StartIndex, EndIndex) default(shared) - for ( i=0; i EndIndex) ) { //Find a ghost +//#define OMP +#ifdef OMP +#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { +#endif + //printf("Id %d\n", omp_get_thread_num()); + +#ifdef 
OMP +#pragma omp for +#endif + for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice + insertMe = verLocInd[i]; + //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost +#ifdef OMP #pragma omp critical - { - numGhostEdges++; - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]<second<<" - "<first<<" : "<second]< verGhostPtr, verGhostInd, tempCounter; - //Mate array for ghost vertices: - vector GMate; //Proportional to the number of ghost vertices - try { - verGhostPtr.reserve(numGhostVertices+1); //Pointer Vector - tempCounter.reserve(numGhostVertices); //Pointer Vector - verGhostInd.reserve(numGhostEdges); //Index Vector - GMate.reserve(numGhostVertices); //Ghost Mate Vector - } catch ( length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; - exit(1); - } - //Initialize the Vectors: - verGhostPtr.resize(numGhostVertices+1, 0); //Pointer Vector - tempCounter.resize(numGhostVertices, 0); //Temporary Counter - verGhostInd.resize(numGhostEdges, -1); //Index Vector - GMate.resize(numGhostVertices, -1); //Temporary Counter - verGhostPtr[0] = 0; //The first value + + //Initialize adjacency Lists for Ghost Vertices: + try { + verGhostPtr.reserve(numGhostVertices + 1); //Pointer Vector + tempCounter.reserve(numGhostVertices); //Pointer Vector + verGhostInd.reserve(numGhostEdges); //Index Vector + GMate.reserve(numGhostVertices); //Ghost Mate Vector + } catch (length_error) { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + //Initialize the Vectors: + verGhostPtr.resize(numGhostVertices + 1, 0); //Pointer Vector + tempCounter.resize(numGhostVertices, 0); //Temporary Counter + verGhostInd.resize(numGhostEdges, -1); //Index Vector + GMate.resize(numGhostVertices, -1); //Temporary Counter + verGhostPtr[0] = 0; //The first value #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) cout< EndIndex) ) { //Find a ghost +#ifdef OMP #pragma omp critical { +#endif insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert verGhostInd[insertMe] = v + StartIndex; //Add the adjacency tempCounter[Ghost2LocalMap[w]]++; //Increment the counter +#ifdef OMP } +#endif } //End of if((w < StartIndex) || (w > EndIndex)) } //End of for(k) } //End of for (v) tempCounter.clear(); //Do not need this any more +#ifdef OMP + } //end of parallel region +#endif #ifdef TIME_TRACKER verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); diff --git a/exec.sh b/exec.sh index 02f4012a..d6e77a21 100755 --- a/exec.sh +++ b/exec.sh @@ -2,6 +2,6 @@ make all cd samples/advanced/pdegen make amg_d_pde3d cd runs -mpirun -np 8 amg_d_pde3d amg_pde3d.inp +mpirun -np 2 amg_d_pde3d amg_pde3d.inp From 76e04ee997e4d24e35414c32276b935902b08035 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 5 May 2022 15:57:58 -0500 Subject: [PATCH 06/96] The OMP and MPI version is now separated in two different files --- README.md | 3 +- 
amgprec/impl/aggregator/Makefile | 3 +- amgprec/impl/aggregator/MatchBoxPC.cpp | 17 +- amgprec/impl/aggregator/MatchBoxPC.h | 11 + ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 258 +-- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1467 +++++++++++++++++ exec.sh | 2 +- 7 files changed, 1565 insertions(+), 196 deletions(-) create mode 100644 amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp diff --git a/README.md b/README.md index fbea8c39..1d330385 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - - AMG4PSBLAS + AMG4PSBLAS Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.7) Salvatore Filippone (University of Rome Tor Vergata and IAC-CNR) diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index d857a3b0..1f6f52af 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -62,7 +62,8 @@ amg_s_parmatch_smth_bld.o \ amg_s_parmatch_spmm_bld_inner.o MPCOBJS=MatchBoxPC.o \ -algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o +algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ +algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o OBJS = $(FOBJS) $(MPCOBJS) diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index 270c6d04..90b448dc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -66,22 +66,35 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif + #define TIME_TRACKER #ifdef TIME_TRACKER double tmr = MPI_Wtime(); #endif - dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, +#define OMP +#ifdef OMP + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge, verLocPtr, verLocInd, edgeLocWeight, verDistance, Mate, myRank, numProcs, C_comm, msgIndSent, msgActualSent, msgPercent, ph0_time, ph1_time, ph2_time, ph1_card, ph2_card ); +#else + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, + verLocPtr, verLocInd, edgeLocWeight, + verDistance, Mate, + myRank, numProcs, C_comm, + msgIndSent, msgActualSent, msgPercent, + ph0_time, ph1_time, ph2_time, + ph1_card, ph2_card ); +#endif + #ifdef TIME_TRACKER tmr = MPI_Wtime() - tmr; - fprintf(stderr, "Elaboration time: %f for $ld\n", tmr, NLEdge); + fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer); #endif #endif diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 21d0a181..6c3f765f 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -152,6 +152,17 @@ extern "C" { inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs); +void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP + ( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, + MilanLongInt* verDistance, + MilanLongInt* Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, + MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, + MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp 
b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 818c9f07..8be438b6 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -1,6 +1,4 @@ #include "MatchBoxPC.h" -#include -#include // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -94,21 +92,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanReal* msgPercent, MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { - - /* - * verDistance: it's a vector long as the number of processors. - * verDistance[i] contains the first node index of the i-th processor - * verDistance[i + 1] contains the last node index of the i-th processor - * NLVer: number of elements in the LocPtr - * NLEdge: number of edges assigned to the current processor - * - * Contains the portion of matrix assigned to the processor in - * Yale notation - * verLocInd: contains the positions on row of the matrix - * verLocPtr: i-th value is the position of the first element on the i-th row and - * i+1-th value is the position of the first element on the i+1-th row - */ - #if !defined(SERIAL_MPI) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex + map Ghost2LocalMap; //Map each ghost vertex to a local vertex // index that starts with zero to |Vg| - 1 map::iterator storedAlready; - vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices + vector Counter; //Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe=0; //Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; - //Mate array for ghost vertices: - vector GMate; //Proportional to the number of ghost vertices - -#ifdef TIME_TRACKER - double Ghost2LocalInitialization = MPI_Wtime(); -#endif - -//#define OMP -#ifdef OMP -#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) - { -#endif - //printf("Id %d\n", omp_get_thread_num()); - -#ifdef OMP -#pragma omp for -#endif - for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice - insertMe = verLocInd[i]; - //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost -#ifdef OMP -#pragma omp critical - { -#endif - numGhostEdges++; - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "< EndIndex) ) { //Find a ghost + storedAlready = Ghost2LocalMap.find( insertMe ); + if ( storedAlready != Ghost2LocalMap.end() ) { //Has already been added + //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + numGhostEdges++; + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]<second<<" - "<first<<" : "<second]< verGhostPtr, verGhostInd, tempCounter; + //Mate array for ghost vertices: + vector GMate; //Proportional to the number of ghost vertices + try { + verGhostPtr.reserve(numGhostVertices+1); //Pointer Vector + tempCounter.reserve(numGhostVertices); //Pointer 
Vector + verGhostInd.reserve(numGhostEdges); //Index Vector + GMate.reserve(numGhostVertices); //Ghost Mate Vector + } catch ( length_error ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + //Initialize the Vectors: + verGhostPtr.resize(numGhostVertices+1, 0); //Pointer Vector + tempCounter.resize(numGhostVertices, 0); //Temporary Counter + verGhostInd.resize(numGhostEdges, -1); //Index Vector + GMate.resize(numGhostVertices, -1); //Temporary Counter + verGhostPtr[0] = 0; //The first value #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) cout< EndIndex) ) { //Find a ghost -#ifdef OMP -#pragma omp critical - { -#endif - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - tempCounter[Ghost2LocalMap[w]]++; //Increment the counter -#ifdef OMP - } -#endif + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v+StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter } //End of if((w < StartIndex) || (w > EndIndex)) } //End of for(k) } //End of for (v) tempCounter.clear(); //Do not need this any more - -#ifdef OMP - } //end of parallel region -#endif -#ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); -#endif - #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< +#include +// *********************************************************************** +// +// MatchboxP: A C++ library for approximate weighted matching +// Mahantesh Halappanavar (hala@pnnl.gov) +// Pacific Northwest National Laboratory +// +// *********************************************************************** +// +// Copyright (2021) Battelle Memorial Institute +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// ************************************************************************ +////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////// DOMINATING EDGES MODEL /////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate() + * + * Date : New update: Feb 17, 2019, Richland, Washington. + * Date : Original development: May 17, 2009, E&CS Bldg. + * + * Purpose : Compute Approximate Maximum Weight Matching in Linear Time + * + * Args : inputMatrix - instance of Compressed-Col format of Matrix + * Mate - The Mate array + * + * Returns : By Value: (void) + * By Reference: Mate + * + * Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge. + * Assumption: The Mate Array is empty. + */ + +/* + NLVer = #of vertices, NLEdge = #of edges + CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers) + verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process + Mate = A vector of size |V_p| (local subgraph) to store the output (matching) + MPI: myRank, numProcs, comm, + Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world + Statistics: ph0_time, ph1_time, ph2_time: Runtimes + Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) + */ + +#ifdef SERIAL_MPI +#else +//MPI type map +template MPI_Datatype TypeMap(); +template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template<> inline MPI_Datatype TypeMap() { return MPI_INT; } +template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } + +// DOUBLE PRECISION VERSION +//WARNING: The vertex block on a given rank is contiguous +void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt* verLocPtr, MilanLongInt* verLocInd, + MilanReal* edgeLocWeight, + MilanLongInt* verDistance, + MilanLongInt* Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, + MilanReal* msgPercent, + MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, + MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { + + /* + * verDistance: it's a vector long as the number of processors. 
+ * verDistance[i] contains the first node index of the i-th processor + * verDistance[i + 1] contains the last node index of the i-th processor + * NLVer: number of elements in the LocPtr + * NLEdge: number of edges assigned to the current processor + * + * Contains the portion of matrix assigned to the processor in + * Yale notation + * verLocInd: contains the positions on row of the matrix + * verLocPtr: i-th value is the position of the first element on the i-th row and + * i+1-th value is the position of the first element on the i+1-th row + */ + +#if !defined(SERIAL_MPI) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<::iterator verLocPtr = inputSubGraph.getVerPtr_b(); + //vector::iterator verLocInd = inputSubGraph.getVerInd_b(); + //vector::iterator edgeLocWeight = inputSubGraph.getEdgeWt_b(); + + //Data structures for sending and receiving messages: + vector Message; // [ u, v, message_type ] + Message.resize(3,-1); + const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; + const MilanLongInt SIZEINFO = 4; + MilanLongInt message_type = 0; + //Data structures for Message Bundling: + //Although up to two messages can be sent along any cross edge, + //only one message will be sent in the initialization phase - + //one of: REQUEST/FAILURE/SUCCESS + vector QLocalVtx, QGhostVtx, QMsgType; + vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! + vector PCounter; + MilanLongInt NumMessagesBundled; + MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! + vector candidateMate; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex + // index that starts with zero to |Vg| - 1 + map::iterator storedAlready; + vector Counter; //Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; + //Mate array for ghost vertices: + vector GMate; //Proportional to the number of ghost vertices + +#ifdef TIME_TRACKER + double Ghost2LocalInitialization = MPI_Wtime(); +#endif + + +#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { + +#pragma omp for + for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice + insertMe = verLocInd[i]; + //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost +#pragma omp critical + { + numGhostEdges++; + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) { //Has already been added + //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]< 0 ) + cout< EndIndex) ) { //Find a ghost +#pragma omp critical + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter + } + } //End of if((w < StartIndex) || (w > EndIndex)) + } //End of for(k) + } //End of for (v) + tempCounter.clear(); //Do not need this any more + +#pragma omp single + { + +#ifdef TIME_TRACKER + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<EndIndex) ) { 
//Is it a ghost vertex? + if(GMate[Ghost2LocalMap[verLocInd[k]]] >= 0 )// Already matched + continue; + } else { //A local vertex + if( Mate[verLocInd[k]-StartIndex] >= 0 ) // Already matched + continue; + } + + if( (edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt)&&(w < verLocInd[k])) ) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + candidateMate[v] = w; + + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost vertex + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex + if ( candidateMate[w-StartIndex] == (v+StartIndex) ) { + Mate[v] = w; //v is local + Mate[w-StartIndex] = v+StartIndex; //w is local + //Q.push_back(u); + U.push_back(v+StartIndex); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + //Get the Adjacency list for u + adj1 = verLocPtr[u-StartIndex]; //Pointer + adj2 = verLocPtr[u-StartIndex+1]; + for( k = adj1; k < adj2; k++ ) { + v = verLocInd[k]; + if ( (v >= StartIndex) && (v <= EndIndex) ) { //If Local Vertex: + if ( (vEndIndex) ) { //Is it a ghost vertex? + if(GMate[Ghost2LocalMap[v]] >= 0 )// Already matched + continue; + } else { //A local vertex + if( Mate[v-StartIndex] >= 0 ) // Already matched + continue; + } //End of else + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? 
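// Descriptive note on the checks below: the Mate array only covers the local
// vertex block [StartIndex, EndIndex], so a ghost endpoint's matching status is
// looked up in GMate through the Ghost2LocalMap index translation, while a local
// endpoint is checked directly in Mate (offset by StartIndex). Already-matched
// endpoints are skipped before the heaviest remaining edge is selected below.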
+ if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + continue; + } else { //A local vertex + if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + continue; + } + if( (edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + heaviestEdgeWt = edgeLocWeight[k1]; + w = verLocInd[k1]; + } + } //End of for loop + candidateMate[v-StartIndex] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if ( candidateMate[w-StartIndex] == v ) { + Mate[v-StartIndex] = w; //v is a local vertex + Mate[w-StartIndex] = v; //w is a local vertex + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v-StartIndex]; + adj12 = verLocPtr[v-StartIndex+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor is a ghost vertex + if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) + candidateMate[NLVer+Ghost2LocalMap[v]] = -1; + if ( v != Mate[u-StartIndex] ) { //u is local + //Build the Message Packet: + //Message[0] = u; //LOCAL + //Message[1] = v; //GHOST + //Message[2] = SUCCESS; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of while ( /*!Q.empty()*/ !U.empty() ) + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// +#ifdef DEBUG_HANG_ + if (myRank == 0) cout<<"\n("< PCumulative, PMessageBundle, PSizeInfoMessages; + MilanLongInt myIndex=0; + try { + PMessageBundle.reserve(NumMessagesBundled*3); //Three integers per message + PCumulative.reserve(numProcs+1); //Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs*3); //Buffer to hold the Size info message packets + } catch ( length_error ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + PMessageBundle.resize(NumMessagesBundled*3, -1);//Initialize + PCumulative.resize(numProcs+1, 0); //Only initialize the counter variable + PSizeInfoMessages.resize(numProcs*3, 0); + + for (MilanInt i=0; i SRequest; //Requests that are used for each send message + vector SStatus; //Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex=0; //Pointer for current message + try { + SRequest.reserve(numProcs*2); //At most two messages per processor + SStatus.reserve(numProcs*2);//At most two messages per processor + } catch ( length_error ) { + cout<<"Error in function 
algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + MPI_Request myReq; //A sample request + SRequest.resize(numProcs*2,myReq); + MPI_Status myStat; //A sample status + SStatus.resize(numProcs*2,myStat); + //Send the Messages + for (MilanInt i=0; i 0 ) { //Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i*3+0], 3, TypeMap(), i, ComputeTag, comm, &SRequest[MessageIndex]); + msgActual++; + MessageIndex++; + //Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), i, BundleTag, comm, &SRequest[MessageIndex]); + MessageIndex++; + } //End of if size > 0 + } + //Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); + PCounter.clear(); +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), comm, &OneMessageSize); //Size of one message packet + //How many messages to send? + //Potentially three kinds of messages will be sent/received: + //Request, Success, Failure. + //But only two will be sent from a given processor. + //Substract the number of messages that have already been sent as bundled messages: + MilanLongInt numMessagesToSend = numGhostEdges*2 - NumMessagesBundled; + MilanInt BufferSize = (OneMessageSize+MPI_BSEND_OVERHEAD)*numMessagesToSend; + + MilanLongInt *Buffer=0; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory + if ( Buffer == 0 ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout<<"Not enough memory to allocate for send buffer on process "< ReceiveBuffer; + MilanLongInt bundleSize=0, bundleCounter=0; + try { + ReceiveBuffer.reserve(numGhostEdges*2*3); //Three integers per cross edge + } catch ( length_error ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout<<"Not enough memory to allocate the internal variables \n"; + exit(1); + } + while ( true ) { +#ifdef DEBUG_HANG_ + if (myRank == 0) cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only If a Local Vertex + //Get the Adjacency list for u + adj1 = verLocPtr[u-StartIndex]; //Pointer + adj2 = verLocPtr[u-StartIndex+1]; + for( k = adj1; k < adj2; k++ ) { + v = verLocInd[k]; + if ( (v >= StartIndex) && (v <= EndIndex) ) { //v is a Local Vertex: + if ( Mate[v-StartIndex] >= 0 ) // v is already matched + continue; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? 
+ if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + continue; + } + else { //A local vertex + if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + continue; + } + + if( (edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + heaviestEdgeWt = edgeLocWeight[k1]; + w = verLocInd[k1]; + } + } //End of for loop + candidateMate[v-StartIndex] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { + Mate[v-StartIndex] = w; //v is local + GMate[Ghost2LocalMap[w]] = v; //w is ghost + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if ( candidateMate[w-StartIndex] == v ) { + Mate[v-StartIndex] = w; //v is local + Mate[w-StartIndex] = v; //w is local + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { //no dominating edge found: w == -1 + adj11 = verLocPtr[v-StartIndex]; + adj12 = verLocPtr[v-StartIndex+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + } //End of if(GHOST) + } //End of for loop + } // End of Else: w == -1 + //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } //End of If (candidateMate[v-StartIndex] == u) + } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor v is a ghost vertex + if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) + candidateMate[NLVer+Ghost2LocalMap[v]] = -1; + if ( v != Mate[u-StartIndex] ) { //u is a local vertex + //Build the Message Packet: + Message[0] = u; //LOCAL + Message[1] = v; //GHOST + Message[2] = SUCCESS; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; +#ifdef DEBUG_GHOST_ + if ((uEndIndex)) { + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of while ( /*!Q.empty()*/ !U.empty() ) + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// + + //// BREAK IF NO MESSAGES EXPECTED ///////// +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS ) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout<<"\n*Error in call to MPI_Receive on Slave: "<(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS ) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout<<"\n*Error in call to 
MPI_Receive on processor "<NLVer)) { + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[u]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } //End of if ( candidateMate[v-StartIndex] == u )e + } //End of if ( Mate[v] == -1 ) + } //End of REQUEST + else { //CASE II: SUCCESS + if ( message_type == SUCCESS ) { +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[u]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) +#ifdef DEBUG_GHOST_ + if ((v<0)||(vNLVer)) { + cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? + if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + continue; + } + else { //A local vertex + if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + continue; + } + + if( (edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + heaviestEdgeWt = edgeLocWeight[k1]; + w = verLocInd[k1]; + } + } //End of for loop + candidateMate[v-StartIndex] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0 ) { + if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { + Mate[v-StartIndex] = w; //v is local + GMate[Ghost2LocalMap[w]] = v; //w is ghost + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[w]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if ( candidateMate[w-StartIndex] == v ) { + Mate[v-StartIndex] = w; //v is local + Mate[w-StartIndex] = v; //w is local + //Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { //No dominant edge found + adj11 = verLocPtr[v-StartIndex]; + adj12 = verLocPtr[v-StartIndex+1]; + for( k1 = adj11; k1 < adj12; k1++ ) { + w = verLocInd[k1]; + if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Build the Message Packet: + Message[0] = v; //LOCAL + Message[1] = w; //GHOST + Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), ghostOwner, ComputeTag, comm); + msgInd++; msgActual++; + } //End of if(GHOST) + } //End of for loop + } // End of Else: w == -1 + //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } //End of if ( candidateMate[v-StartIndex] == u ) + } //End of if ( Mate[v] == -1 ) + } //End of if ( message_type == SUCCESS ) + else { //CASE III: FAILURE +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement + if ( Counter[Ghost2LocalMap[u]] == 0 ) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } //End of else: CASE III + } //End of else: CASE I + } //End of 
if (!MsgQ.empty()) + ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 ) { + MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer + free(Buffer); //Free the memory that was allocated + } + finishTime = MPI_Wtime(); + *ph2_time = finishTime-startTime; //Time taken for Phase-2 + *ph2_card = myCard ; //Cardinality at the end of Phase-2 + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0) { + *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; + } else { + *msgPercent = 0; + } + + } //end single + + } //end of parallel region + +#ifdef DEBUG_HANG_ + if (myRank == 0) cout<<"\n("< vtxIndex ) + End = Current - 1; + else //CASE 3: + Start = Current + 1; + } + } //End of While() + if ( Current == 0 ) + return (Current); + else { + if ( mVerDistance[Current] > vtxIndex ) + return (Current-1); + else + return (Current); + } //End of else + return (-1); //It should not reach here! +} //End of findOwnerOfGhost() +#endif + +#endif \ No newline at end of file diff --git a/exec.sh b/exec.sh index d6e77a21..3174e0a5 100755 --- a/exec.sh +++ b/exec.sh @@ -2,6 +2,6 @@ make all cd samples/advanced/pdegen make amg_d_pde3d cd runs -mpirun -np 2 amg_d_pde3d amg_pde3d.inp +mpirun -np 4 amg_d_pde3d amg_pde3d.inp From a20f0d47e7265b29a97a16e67c01e0ac22183681 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 8 May 2022 12:15:46 -0500 Subject: [PATCH 07/96] Solved the static queue out of scope problem --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 407 ++++++++++-------- exec.sh | 1 + 2 files changed, 223 insertions(+), 185 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index f232cfa2..27014cca 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -204,23 +204,25 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector verGhostPtr, verGhostInd, tempCounter; //Mate array for ghost vertices: vector GMate; //Proportional to the number of ghost vertices - + MilanLongInt S; + staticQueue U; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif - -#pragma omp parallel private(insertMe, k, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, adj1, adj2, heaviestEdgeWt) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { -#pragma omp for + // TODO comments about the fking reduction + +#pragma omp for reduction(+ : numGhostEdges) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost + numGhostEdges++; #pragma omp critical { - numGhostEdges++; storedAlready = Ghost2LocalMap.find(insertMe); if (storedAlready != Ghost2LocalMap.end()) { //Has already been added //cout<<"Process "<first<<" - "<second< 0 ) - cout< 0 ) + cout< EndIndex) ) { //Find a ghost + + //TODO why the nowait here fails? 
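// A plausible answer to the TODO above (an inference, not stated in the patch):
// "nowait" removes the implicit barrier at the end of the worksharing loop that
// follows, so the first thread to finish can enter the subsequent
// "#pragma omp single" block while other threads are still filling verGhostInd
// and tempCounter, and the single block then reads those arrays mid-update.
// Keeping the implicit barrier, or issuing an explicit "#pragma omp barrier"
// before the single region, avoids that race.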
+ +#pragma omp for nowait + for (v = 0; v < NLVer; v++) { + adj1 = verLocPtr[v]; //Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) { + w = verLocInd[k]; //Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) { //Find a ghost #pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - tempCounter[Ghost2LocalMap[w]]++; //Increment the counter - } - } //End of if((w < StartIndex) || (w > EndIndex)) - } //End of for(k) - } //End of for (v) - tempCounter.clear(); //Do not need this any more + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter + } + } //End of if((w < StartIndex) || (w > EndIndex)) + } //End of for(k) + } //End of for (v) #pragma omp single { #ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[verLocInd[k]]] >= 0 )// Already matched + for (k = adj1; k < adj2; k++) { + if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched continue; } else { //A local vertex - if( Mate[verLocInd[k]-StartIndex] >= 0 ) // Already matched + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched continue; } - if( (edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt)&&(w < verLocInd[k])) ) { + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; } } //End of for loop candidateMate[v] = w; + // } - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { - S--; //Decrement S + /* + for ( v=0; v < NLVer; v++ ) { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { // w is a local vertex - if ( candidateMate[w-StartIndex] == (v+StartIndex) ) { - Mate[v] = w; //v is local - Mate[w-StartIndex] = v+StartIndex; //w is local - //Q.push_back(u); - U.push_back(v+StartIndex); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { - w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + adj1 = verLocPtr[v]; + adj2 = verLocPtr[v + 1]; + w = candidateMate[v]; +*/ +//#pragma omp critical + // { + //End: 
PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= 0) { + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex //Build the Message Packet: //Message[0] = v+StartIndex; //LOCAL //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE + //Message[2] = REQUEST; //TYPE //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Date: Sun, 8 May 2022 15:11:56 -0500 Subject: [PATCH 08/96] PARALLEL_COMPUTE_CANDIDATE_MATE_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 60 ++++++++++--------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 27014cca..63e76d6d 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -162,7 +162,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector PCounter; MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - vector candidateMate; + //vector candidateMate; + MilanLongInt* candidateMate = new MilanLongInt[1]; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Mon, 9 May 2022 16:52:03 -0500 Subject: [PATCH 09/96] Instable PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 141 +++++++++++------- 1 file changed, 85 insertions(+), 56 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 63e76d6d..77494032 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -159,7 +159,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //one of: REQUEST/FAILURE/SUCCESS vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - vector PCounter; + + MilanLongInt* PCounter = new MilanLongInt [numProcs]; + for (int i = 0; i < numProcs; i++) + PCounter[i] = 0; + MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! 
//vector candidateMate; @@ -211,7 +215,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, adj1, adj2, heaviestEdgeWt, w) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -370,13 +374,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.reserve(numGhostEdges); //Ghost Vertex QMsgType.reserve(numGhostEdges); //Message Type (Request/Failure) QOwner.reserve(numGhostEdges); //Owner of the ghost: COmpute once and use later - PCounter.reserve(numProcs); //Store How many messages will be sent to each processor } catch (length_error) { cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; cout << "Not enough memory to allocate the internal variables \n"; exit(1); } - PCounter.resize(numProcs, 0); //Only initialize the counter variable #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { + myCard++; if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex //Build the Message Packet: //Message[0] = v+StartIndex; //LOCAL @@ -495,56 +511,65 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), ComputeTag, comm);*/ - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; msgInd++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { - Mate[v] = w; - GMate[Ghost2LocalMap[w]] = v + StartIndex; //w is a Ghost - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - myCard++; + NumMessagesBundled++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + PCounter[ghostOwner]++; //TODO maybe reduction? 
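// One way to resolve the TODO above (a suggestion, not part of the original
// change): since OpenMP 4.5 a pointer-based array section may appear in a
// reduction clause, so the per-owner counts in PCounter can be accumulated
// race-free with something like
//   #pragma omp for reduction(+ : PCounter[:numProcs])
// on the enclosing worksharing loop; the same clause style appears elsewhere
// in this patch series.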
+#pragma omp critical + { + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); + //ghostOwner = inputSubGraph.findOwner(w); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { + + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; //w is a Ghost + U.push_back(v + StartIndex); + U.push_back(w); + #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) + if (Counter[Ghost2LocalMap[w]] > 0) { + + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v + } + } //End of if Counter[w] > 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } // end of critical region } //End of if a Ghost Vertex else { // w is a local vertex - if (candidateMate[w - StartIndex] == (v + StartIndex)) { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - myCard++; + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { +#pragma omp critical + { + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); + #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) + } + } //End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } //End of Else + } //End of if(w >=0) else { adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; @@ -563,23 +588,28 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), ComputeTag, comm); */ - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + PCounter[ghostOwner]++; +#pragma omp critical + { + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + //ghostOwner = inputSubGraph.findOwner(w); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + } + } //End of if(GHOST) } //End of for loop } // End of Else: w == -1 //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) //} // end of critical } //End of for ( v=0; v < NLVer; v++ ) + } // end of parallel region tempCounter.clear(); //Do not need this any more //} // end of parallel region @@ -855,7 +885,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.clear(); QMsgType.clear(); QOwner.clear(); - PCounter.clear(); #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< EndIndex)) { //Is it a ghost vertex? 
- if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - continue; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - continue; - } - - if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - - } - } //End of for loop - candidateMate[v] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B + * It is actually not possible to parallelize this cycle + * as it is. + * + * TODO think how it could be parallelizable + */ - } + for ( v=0; v < NLVer; v++ ) { +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched + continue; + } else { //A local vertex + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched + continue; + } - /* - TODO this cycle has a lot of margin of improvement!!!! - This current version introduce some errors. - 1 - ollback to the previous verison and check if it is - 100% stable - 2 - if the previous verison was stable all right, if not - that's a big deal - 3 - reimplement step by step to check from where the instability - comes from - */ + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; -#pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) - for ( v=0; v < NLVer; v++ ) { + } + } //End of for loop + candidateMate[v] = w; + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) + //If found a dominating edge: + if (w >= 0) { + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } // end of critical region - } //End of if a Ghost Vertex - else { // w is a local vertex + } + } //End of if Counter[w] > 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical - { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); + + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < 
adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) + else { + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } @@ -962,7 +934,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } while ( true ) { #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } @@ -1112,7 +1084,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); msgInd++; msgActual++; #ifdef DEBUG_GHOST_ - if ((uEndIndex)) { + if ((uEndIndex)) { cout<<"\n("<EndIndex)) { + if ((vEndIndex)) { cout<<"\n("<NLVer)) { + if ((v<0)||(vNLVer)) { cout<<"\n("< 0 //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) #ifdef DEBUG_GHOST_ - if ((v<0)||(vNLVer)) { + if ((v<0)||(vNLVer)) { cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } @@ -1451,8 +1423,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); //MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer if ( BufferSize > 0 ) { - MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer - free(Buffer); //Free the memory that was allocated + MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer + free(Buffer); //Free the memory that was allocated } finishTime = MPI_Wtime(); *ph2_time = finishTime-startTime; //Time taken for Phase-2 @@ -1478,9 +1450,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( *msgActualSent = msgActual; *msgIndSent = msgInd; if (msgInd > 0) { - *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; + *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; } else { - *msgPercent = 0; + *msgPercent = 0; } #ifdef DEBUG_HANG_ From 1140669ea7b07fd93b0d528ba12d9bb49742f920 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 07:01:42 -0500 Subject: [PATCH 11/96] firstComputeCandidateMate --- amgprec/impl/aggregator/MatchBoxPC.h | 5 ++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 59 ++++++++++++++++++- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 6c3f765f..94ea7ea8 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -152,6 +152,11 @@ extern "C" { inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt 
numProcs); +inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt* verLocInd, + MilanReal* edgeLocWeight); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 4814d32e..84d05e08 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -455,9 +455,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; - } } //End of for loop + //printf("Compare %ld, %ld\n", w, firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight)); candidateMate[v] = w; //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) @@ -594,8 +594,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - while ( /*!Q.empty()*/ !U.empty() ) { - //Q.pop_front(); + while ( !U.empty() ) { u = U.pop_front(); //Get an element from the queue #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + return w; +} + +/* +inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt* verLocInd, + MilanReal* edgeLocWeight) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) { + if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched + continue; + } else { //A local vertex + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched + continue; + } + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + return w; +} + */ #endif #endif \ No newline at end of file diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..bdacc992 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +0123 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From b4bfdd83e5fdf310e54af8e8127dc30ace50a804 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 10:22:58 -0500 Subject: [PATCH 12/96] computeCandidateMate and isAlreadyMatched --- amgprec/impl/aggregator/MatchBoxPC.h | 19 +++++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 73 ++++++++++++++----- 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 94ea7ea8..73908b9b 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -157,6 +157,25 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt* verLocInd, MilanReal* edgeLocWeight); +inline bool isAlreadyMatched(MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap); + +inline MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 84d05e08..e73c7939 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -443,13 +443,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( w = -1; heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN for (k = adj1; k < adj2; k++) { - if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - continue; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - continue; - } + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { @@ -1533,22 +1527,66 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, return w; } -/* -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, +/** + * //TODO documentation + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline bool isAlreadyMatched(MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap + ) { + + if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? 
+ if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched + return true; + } else { //A local vertex + if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched + return true; + } + + return false; +} + +/** + * //TODO documentation + * @param adj1 + * @param adj2 + * @param edgeLocWeight + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, MilanLongInt* verLocInd, - MilanReal* edgeLocWeight) + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN for (k = adj1; k < adj2; k++) { - if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - continue; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - continue; - } + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { @@ -1558,7 +1596,6 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, } //End of for loop return w; } - */ #endif #endif \ No newline at end of file From 6180f29f39380436c2a03d5e07c9fd76c0039d4d Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 11:23:39 -0500 Subject: [PATCH 13/96] PARALLEL_COMPUTE_CANDIDATE_MATE_B is now paralle and correct --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 63 +++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e73c7939..1b7014f5 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -424,38 +424,35 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Compute the Initial Matching Set: S = numGhostVertices; //Initialize S with number of Ghost Vertices + } // end of single region - /* - * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B - * It is actually not possible to parallelize this cycle - * as it is. - * - * TODO think how it could be parallelizable - */ + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
+ */ +#pragma omp for for ( v=0; v < NLVer; v++ ) { #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - } - } //End of for loop - //printf("Compare %ld, %ld\n", w, firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight)); - candidateMate[v] = w; + candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + } +#pragma omp single + { + + + for ( v=0; v < NLVer; v++ ) + { //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { myCard++; @@ -1516,15 +1527,17 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + int finalK; for (int k = adj1; k < adj2; k++) { if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; + finalK = k; } } //End of for loop - return w; + return finalK; } /** @@ -1579,9 +1592,9 @@ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt* verLocInd, MilanLongInt StartIndex, MilanLongInt EndIndex, - vector &GMate, + vector & GMate, MilanLongInt* Mate, - map &Ghost2LocalMap) + map & Ghost2LocalMap) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN From 2cac21b345bfdc2e0eb1e2116bc7e8212f602ba7 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 21 May 2022 11:46:40 -0500 Subject: [PATCH 14/96] fix and reformatting --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 208 +++++++++--------- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 109 insertions(+), 101 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 1b7014f5..9349e9a2 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -446,7 +446,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #pragma omp single { - for ( v=0; v < NLVer; v++ ) { //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) @@ -461,131 +460,140 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) + + //This piece of code is actually executed under 0.01% of the times + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) { + + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE + 
//Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) + if (Counter[Ghost2LocalMap[w]] > 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { // w is a local vertex + } + } //End of if Counter[w] > 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { // w is a local vertex - if (candidateMate[w - StartIndex] == (v + StartIndex)) { + if (candidateMate[w - StartIndex] == (v + StartIndex)) { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); + Mate[v] = w; //v is local + Mate[w - StartIndex] = v + StartIndex; //w is local + //Q.push_back(u); + U.push_back(v + StartIndex); + U.push_back(w); #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) + + //if (w < 0) { -- if it arrives here this one if is useless, it is certainly -1 + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v+StartIndex; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Date: Sun, 22 May 2022 16:50:07 -0500 Subject: [PATCH 15/96] PARALLEL_PROCESS_EXPOSED_VERTEX_B named critical sections --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 9349e9a2..e83c37b8 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -443,9 +443,16 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) } -#pragma omp single - { + /* + * PARALLEL_PROCESS_EXPOSED_VERTEX_B + * The sequential version could be a bit more + * efficient. + * + * TODO: Test when it's more efficient to execute this code + * in parallel. 
+ */ +#pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) for ( v=0; v < NLVer; v++ ) { //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) @@ -461,23 +468,24 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout<<"\n("<= 0) { //This piece of code is actually executed under 0.01% of the times - if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; +#pragma omp critical + { + if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } } if (w >= 0) { @@ -500,7 +508,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); PCounter[ghostOwner]++; - +#pragma omp critical (QLocalPush) + { QLocalVtx.push_back(v + StartIndex); QGhostVtx.push_back(w); QMsgType.push_back(REQUEST); @@ -534,11 +543,13 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if Counter[w] > 0 //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) } //End of if CandidateMate[w] = v + } // end of critical region } //End of if a Ghost Vertex else { // w is a local vertex if (candidateMate[w - StartIndex] == (v + StartIndex)) { - +#pragma omp critical (UPush) + { Mate[v] = w; //v is local Mate[w - StartIndex] = v + StartIndex; //w is local //Q.push_back(u); @@ -548,6 +559,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Sun, 22 May 2022 17:35:08 -0500 Subject: [PATCH 16/96] False sharing fix --- ...DomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e83c37b8..e598a21f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -293,8 +293,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /* * OMP verGhostPtrInitialization * + * schedule(static) assign to each thread an huge chunk + * it is used in this case to reduce the overhead of chunk assignment + * and to reduce false sharing */ -#pragma omp for nowait +#pragma omp for nowait schedule(static) for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ @@ -332,7 +335,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double verGhostIndInitialization = MPI_Wtime(); #endif -#pragma omp for nowait +#pragma omp for nowait schedule(static) for (v = 0; v < NLVer; v++) { adj1 = verLocPtr[v]; //Vertex Pointer adj2 = verLocPtr[v + 1]; @@ -433,7 +436,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
*/ -#pragma omp for +#pragma omp for schedule(static) for ( v=0; v < NLVer; v++ ) { #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Sun, 29 May 2022 12:01:24 -0500 Subject: [PATCH 17/96] Refactoring + critical(Mate) --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 79 +++++++++---------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e598a21f..a08c5485 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -215,7 +215,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner, u) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -475,9 +475,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( if (w >= 0) { //This piece of code is actually executed under 0.01% of the times -#pragma omp critical - { - if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { w = computeCandidateMate(verLocPtr[v], verLocPtr[v + 1], edgeLocWeight, 0, @@ -489,7 +488,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( Ghost2LocalMap); candidateMate[v] = w; } - } if (w >= 0) { @@ -511,7 +509,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); PCounter[ghostOwner]++; -#pragma omp critical +#pragma omp critical(Mate) { QLocalVtx.push_back(v + StartIndex); QGhostVtx.push_back(w); @@ -551,7 +549,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( else { // w is a local vertex if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical +#pragma omp critical(Mate) { Mate[v] = w; //v is local Mate[w - StartIndex] = v + StartIndex; //w is local @@ -632,41 +630,32 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( adj2 = verLocPtr[u-StartIndex+1]; for( k = adj1; k < adj2; k++ ) { v = verLocInd[k]; + if ( (v >= StartIndex) && (v <= EndIndex) ) { //If Local Vertex: - if ( (vEndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[v]] >= 0 )// Already matched - continue; - } else { //A local vertex - if( Mate[v-StartIndex] >= 0 ) // Already matched - continue; - } //End of else + + if (isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? 
- if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched - continue; - } else { //A local vertex - if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched - continue; - } - if( (edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { - heaviestEdgeWt = edgeLocWeight[k1]; - w = verLocInd[k1]; - } - } //End of for loop - candidateMate[v-StartIndex] = w; + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); +#pragma omp critical + { + candidateMate[v - StartIndex] = w; + } //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< &GMate, @@ -1582,15 +1570,20 @@ inline bool isAlreadyMatched(MilanLongInt k, map &Ghost2LocalMap ) { - if ((verLocInd[k] < StartIndex) || (verLocInd[k] > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k]]] >= 0)// Already matched - return true; - } else { //A local vertex - if (Mate[verLocInd[k] - StartIndex] >= 0) // Already matched - return true; + bool result = false; +#pragma omp critical(Mate) + { + if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched + result = true; + } else { //A local vertex + if (Mate[node - StartIndex] >= 0) // Already matched + result = true; + } + } - return false; + return result; } /** @@ -1621,7 +1614,7 @@ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN for (k = adj1; k < adj2; k++) { - if (isAlreadyMatched(k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { From f3d7b3ab5e508857dddb8eb5b4aac5fa43c92c57 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 29 May 2022 12:01:28 -0500 Subject: [PATCH 18/96] False sharing fix --- amgprec/impl/aggregator/MatchBoxPC.h | 3 +-- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 73908b9b..ba7cb5c8 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -157,8 +157,7 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt* verLocInd, MilanReal* edgeLocWeight); -inline bool isAlreadyMatched(MilanLongInt k, - MilanLongInt* verLocInd, +inline bool isAlreadyMatched(MilanLongInt node, MilanLongInt StartIndex, MilanLongInt EndIndex, vector &GMate, diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..bdacc992 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +0123 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From 2c8dc2ffddba0669e93c837e0bc440e674644eda Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 30 May 2022 13:49:34 -0500 Subject: [PATCH 19/96] PROCESS MATCHED VERTICES parallelization draft --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 126 ++++++++++++------ samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 87 insertions(+), 41 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index a08c5485..0140c0c6 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -210,12 +210,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Mate array for ghost vertices: vector GMate; //Proportional to the number of ghost vertices MilanLongInt S; - staticQueue U; + MilanLongInt privateMyCard = 0; + staticQueue U, privateU; + bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner, u) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, w, ghostOwner, u, privateU, privateMyCard, isEmpty) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -451,6 +453,10 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * The sequential version could be a bit more * efficient. * + * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner + * first in a local variable and then, only at the end, append them to the real data structure + * to remove the critical sections. + * * TODO: Test when it's more efficient to execute this code * in parallel. 
*/ @@ -508,6 +514,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( msgInd++; NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); PCounter[ghostOwner]++; #pragma omp critical(Mate) { @@ -590,15 +598,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; msgInd++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); PCounter[ghostOwner]++; #pragma omp critical { QLocalVtx.push_back(v + StartIndex); QGhostVtx.push_back(w); QMsgType.push_back(FAILURE); - //ghostOwner = inputSubGraph.findOwner(w); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); } @@ -607,7 +614,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //} // End of Else: w == -1 //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) - } // end of parallel region tempCounter.clear(); //Do not need this any more @@ -619,19 +625,37 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// + privateU.~staticQueue(); + new(&privateU) staticQueue(1000); //TODO how can I put a meaningfull size? + /* while ( !U.empty() ) { u = U.pop_front(); //Get an element from the queue + */ + isEmpty = false; + while( true ) + { + +#pragma omp critical(U) + { + if (U.empty()) isEmpty = true; + else u = U.pop_front(); + } // End of critical U + if (isEmpty) break; + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + +#pragma omp critical + { //Get the Adjacency list for u - adj1 = verLocPtr[u-StartIndex]; //Pointer - adj2 = verLocPtr[u-StartIndex+1]; - for( k = adj1; k < adj2; k++ ) { + adj1 = verLocPtr[u - StartIndex]; //Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { v = verLocInd[k]; - if ( (v >= StartIndex) && (v <= EndIndex) ) { //If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: if (isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; @@ -640,7 +664,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - if ( candidateMate[v-StartIndex] == u ) { //Only if pointing to the matched vertex + if (candidateMate[v - StartIndex] == u) { //Only if pointing to the matched vertex //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], @@ -652,17 +676,16 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( GMate, Mate, Ghost2LocalMap); -#pragma omp critical - { + candidateMate[v - StartIndex] = w; - } + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { //A ghost //Build the Message Packet: //Message[0] = v; //LOCAL //Message[1] = w; //GHOST @@ -678,26 +701,28 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.push_back(w); QMsgType.push_back(REQUEST); //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = 
findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; - if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { - Mate[v-StartIndex] = w; //v is a local vertex + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; //v is a local vertex GMate[Ghost2LocalMap[w]] = v; //w is a ghost vertex //Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< 0 ) { + if (Counter[Ghost2LocalMap[w]] > 0) { Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { + if (Counter[Ghost2LocalMap[w]] == 0) { S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<=0) else { - adj11 = verLocPtr[v-StartIndex]; - adj12 = verLocPtr[v-StartIndex+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) { //A ghost //Build the Message Packet: //Message[0] = v; //LOCAL //Message[1] = w; //GHOST @@ -744,7 +769,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.push_back(w); QMsgType.push_back(FAILURE); //ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); PCounter[ghostOwner]++; NumMessagesBundled++; @@ -756,9 +783,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of If (candidateMate[v-StartIndex] == u) } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { //Neighbor is a ghost vertex - if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) - candidateMate[NLVer+Ghost2LocalMap[v]] = -1; - if ( v != Mate[u-StartIndex] ) { //u is local + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) { //u is local //Build the Message Packet: //Message[0] = u; //LOCAL //Message[1] = v; //GHOST @@ -774,7 +801,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx.push_back(v); QMsgType.push_back(SUCCESS); //ghostOwner = inputSubGraph.findOwner(v); - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); QOwner.push_back(ghostOwner); PCounter[ghostOwner]++; NumMessagesBundled++; @@ -782,8 +811,24 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of If( v != Mate[u] ) } //End of Else //A Ghost Vertex } //End of For Loop adj(u) + + } + } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + +#pragma omp critical(U) + { + while(!privateU.empty()) { + U.push_back(privateU.pop_front()); + } + + myCard += privateMyCard; + } //End of critical U + } //End of while ( 
/*!Q.empty()*/ !U.empty() ) + + } // end of parallel region + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ if (myRank == 0) cout<<"\n("<(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; + msgInd++; + msgActual++; #ifdef DEBUG_GHOST_ if ((uEndIndex)) { cout<<"\n("<= 0) { - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - //Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; + //If found a dominating edge: + if (w >= 0) { if ((w < StartIndex) || (w > EndIndex)) { //A ghost //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = REQUEST; //TYPE //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + Mate[w - StartIndex] = v; //w is a local vertex + //Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + //Build the Message Packet: + //Message[0] = v; //LOCAL + //Message[1] = w; //GHOST + //Message[2] = FAILURE; //TYPE + //Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { //Neighbor is a ghost vertex - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex #pragma omp critical(U) @@ -827,8 +834,15 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of while ( /*!Q.empty()*/ !U.empty() ) +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + 
omp_get_thread_num(), + myRank); +#endif } // end of parallel region + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ if (myRank == 0) cout<<"\n("< Date: Tue, 31 May 2022 16:04:56 -0500 Subject: [PATCH 21/96] Removed one critical region from PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 86 ++++++++++++------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 43584b77..783ed17e 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -211,13 +211,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector GMate; //Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; - staticQueue U, privateU; + staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; + + /* + staticQueue privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner; + */ bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner /*, privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner*/) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -291,21 +295,18 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( double verGhostPtrInitialization = MPI_Wtime(); #endif - } + /* - * OMP verGhostPtrInitialization - * - * schedule(static) assign to each thread an huge chunk - * it is used in this case to reduce the overhead of chunk assignment - * and to reduce false sharing + * Not parallelizable */ -#pragma omp for nowait schedule(static) + for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ cout< EndIndex)) { //w is a ghost vertex - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Thu, 2 Jun 2022 07:29:21 -0500 Subject: [PATCH 22/96] Further optimizations PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 284 +++++++++--------- 1 file changed, 143 insertions(+), 141 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 783ed17e..9f3cbb97 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -164,6 +164,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( for (int i = 0; i < 
numProcs; i++) PCounter[i] = 0; + MilanLongInt NumMessagesBundled; MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! //vector candidateMate; @@ -213,15 +214,12 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - /* - staticQueue privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner; - */ bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner /*, privateReqQLocalVtx, privateReqQGhostVtx, privateReqQMsgType, privateReqQOwner*/) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { // TODO comments about the reduction @@ -402,7 +400,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * Create the Queue Data Structure for the Dominating Set * * I had to declare the staticuQueue U before the parallel region - * to have it in the correct scope. Since we can't chane the dimension + * to have it in the correct scope. Since we can't change the dimension * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ @@ -462,102 +460,103 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * in parallel. */ - MilanLongInt size = numGhostEdges; //TODO how can I decide a meaningfull size? + MilanLongInt size = numGhostVertices; //TODO how can I decide a more meaningfull size? //Fail messages privateQLocalVtx.~staticQueue(); privateQGhostVtx.~staticQueue(); privateQMsgType.~staticQueue(); privateQOwner.~staticQueue(); - //Request messages - /* - privateReqQLocalVtx.~staticQueue(); - privateReqQGhostVtx.~staticQueue(); - privateReqQMsgType.~staticQueue(); - privateReqQOwner.~staticQueue(); - */ + privateU.~staticQueue(); + + new(&privateU) staticQueue(NLVer + numGhostVertices); //TODO how can I put a meaningfull size? 
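The explicit destructor call followed by placement new, here and for the private queues just below, is the idiom these patches use to give a staticQueue declared before the parallel region its real capacity once the ghost counts are known. A reduced sketch of the idiom on a stand-in type (FixedQueue is illustrative, not the real staticQueue):

#include <new> // placement new

// FixedQueue: capacity fixed at construction, copying disabled, so an already-declared
// object can only be given a new capacity by destroying it and rebuilding it in place.
struct FixedQueue {
    long *buf;
    long cap, tail;
    explicit FixedQueue(long n = 0) : buf(new long[n]), cap(n), tail(0) {}
    ~FixedQueue() { delete[] buf; }
    FixedQueue(const FixedQueue &) = delete;
    FixedQueue &operator=(const FixedQueue &) = delete;
    void push_back(long x) { buf[tail++] = x; } // assumes tail < cap
};

void giveRealCapacity(FixedQueue &q, long realSize)
{
    q.~FixedQueue();               // end the lifetime of the placeholder object
    new (&q) FixedQueue(realSize); // construct the right-sized queue in place
}

Nothing may touch the object between the two statements, since its lifetime ends at the destructor call and only restarts with the placement new.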
new(&privateQLocalVtx) staticQueue(size); new(&privateQGhostVtx) staticQueue(size); new(&privateQMsgType) staticQueue(size); new(&privateQOwner) staticQueue(size); - /* - new(&privateReqQLocalVtx) staticQueue(size); - new(&privateReqQGhostVtx) staticQueue(size); - new(&privateReqQMsgType) staticQueue(size); - new(&privateReqQOwner) staticQueue(size); - */ #pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) - for ( v=0; v < NLVer; v++ ) - { - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - k = candidateMate[v]; - candidateMate[v] = verLocInd[k]; - w = candidateMate[v]; + for (v = 0; v < NLVer; v++) { + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - //If found a dominating edge: - if (w >= 0) { - - //This piece of code is actually executed under 0.01% of the times - - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } - if (w >= 0) { + if (w >= 0) { - myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + myCard++; + if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + //Decrement the counter: + //Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) +#pragma omp critical + { + if (Counter[Ghost2LocalMap[w]] > 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + Counter[Ghost2LocalMap[w]] -= 1; //Decrement if (Counter[Ghost2LocalMap[w]] == 0) { S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ @@ -565,79 +564,86 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - } // end of critical region - } //End of if a Ghost Vertex - else { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) { -#pragma omp critical(Mate) - { - Mate[v] = w; //v is local - Mate[w - StartIndex] = v + StartIndex; //w is local - //Q.push_back(u); - U.push_back(v + StartIndex); - U.push_back(w); - -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - - //if (w < 0) { -- if it arrives here this one if is useless, it is certainly -1 - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v+StartIndex; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } //End of if CandidateMate[w] = v + + + } //End of if a Ghost Vertex + else { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + privateU.push_back(v + StartIndex); + privateU.push_back(w); + + Mate[v] = w; //v is local + //FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; //w is local + + 
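privateU, and further down the privateQ* queues, implement the buffer-then-merge idea from the earlier TODO: each thread collects entries in a private queue and appends them to the shared structures in batches, rather than taking a lock on every push_back. A minimal sketch of the pattern, with std::vector standing in for staticQueue and placeholder names:

#include <omp.h>
#include <vector>

// Each thread fills a private vector; the shared queue U is locked once per batch
// instead of once per push_back.
void gatherMatchedVertices(const std::vector<long> &mate, std::vector<long> &U)
{
#pragma omp parallel
    {
        std::vector<long> privateU; // thread-local buffer
#pragma omp for nowait
        for (long v = 0; v < (long)mate.size(); v++)
            if (mate[v] >= 0)
                privateU.push_back(v); // no locking on the hot path
#pragma omp critical(U)
        U.insert(U.end(), privateU.begin(), privateU.end()); // one merge per thread
    }
}

The nowait clause lets a thread move on to its merge as soon as its share of the loop is done.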
+#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + + //This piece of code is executed a really small amount of times, I will not allocate a + //huge amount of memory to the private data structures. + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Date: Thu, 2 Jun 2022 09:15:31 -0500 Subject: [PATCH 23/96] Extendend parallel region after SEND PACKET BUNDLE Nothing parallelizable founded --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 176 ++++++++++-------- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 99 insertions(+), 79 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 9f3cbb97..2a541e9f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -213,7 +213,15 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt S; MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - + MilanLongInt myIndex = 0; + vector PCumulative, PMessageBundle, PSizeInfoMessages; + vector SRequest; //Requests that are used for each send message + vector SStatus; //Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex = 0; //Pointer for current message + MilanInt OneMessageSize = 0; + MilanLongInt numMessagesToSend; + MilanInt BufferSize; + MilanLongInt *Buffer; bool isEmpty; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); @@ -868,7 +876,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( omp_get_thread_num(), myRank); #endif - } // end of parallel region ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// @@ -878,28 +885,34 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ///////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// - //Data structures for Bundled Messages: - vector PCumulative, PMessageBundle, PSizeInfoMessages; - MilanLongInt myIndex=0; - try { - PMessageBundle.reserve(NumMessagesBundled*3); //Three integers per message - PCumulative.reserve(numProcs+1); //Similar to Row Pointer vector in CSR data structure - PSizeInfoMessages.reserve(numProcs*3); //Buffer to hold the Size info message packets - } catch ( length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; - exit(1); - } - PMessageBundle.resize(NumMessagesBundled*3, -1);//Initialize - PCumulative.resize(numProcs+1, 0); //Only initialize the counter variable - PSizeInfoMessages.resize(numProcs*3, 0); - - for (MilanInt i=0; i SRequest; //Requests that are used for each send message - vector SStatus; //Status of sent messages, used in MPI_Wait - MilanLongInt MessageIndex=0; //Pointer for current message - try { - SRequest.reserve(numProcs*2); //At most two messages per processor - SStatus.reserve(numProcs*2);//At most two messages per processor - } catch ( 
length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; - exit(1); - } - MPI_Request myReq; //A sample request - SRequest.resize(numProcs*2,myReq); - MPI_Status myStat; //A sample status - SStatus.resize(numProcs*2,myStat); - //Send the Messages - for (MilanInt i=0; i 0 ) { //Send only if it is a nonempty packet - MPI_Isend(&PSizeInfoMessages[i*3+0], 3, TypeMap(), i, ComputeTag, comm, &SRequest[MessageIndex]); - msgActual++; - MessageIndex++; - //Now Send the message with the data packet: -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { //Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[MessageIndex]); + msgActual++; + MessageIndex++; + //Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<(), i, BundleTag, comm, &SRequest[MessageIndex]); - MessageIndex++; - } //End of if size > 0 - } - //Free up temporary memory: - PCumulative.clear(); - QLocalVtx.clear(); - QGhostVtx.clear(); - QMsgType.clear(); - QOwner.clear(); + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); + MessageIndex++; + } //End of if size > 0 + } + //Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); + + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= 0) { if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Date: Fri, 10 Jun 2022 15:34:29 -0500 Subject: [PATCH 25/96] PROCESS MATCHED VERTICES draft of parallelization --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 334 +++++++++--------- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 175 insertions(+), 161 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index dee2a019..d93337c9 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,6 +1,7 @@ #include "MatchBoxPC.h" #include #include + // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -602,8 +603,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if(w >=0) - //This piece of code is executed a really small amount of times, I will not allocate a - //huge amount of memory to the private data structures. 
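        // The loop below walks v's adjacency list and, for every ghost neighbour, queues a
        // FAILURE notification addressed to that neighbour's owner (in this version the
        // entries go onto the per-thread private queues and are merged into the shared
        // QLocalVtx/QGhostVtx/QMsgType/QOwner afterwards).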
adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; for (k1 = adj11; k1 < adj12; k1++) { @@ -622,16 +621,28 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( assert(ghostOwner != -1); assert(ghostOwner != myRank); PCounter[ghostOwner]++; - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); } //End of if(GHOST) } //End of for loop //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) { + + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); + + } + } + #pragma omp critical(U) { while (!privateU.empty()) @@ -658,202 +669,205 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt localVertices = 0; #endif - while( true ) - { + while (true) { #pragma omp critical(U) - { - if (U.empty()) isEmpty = true; - else u = U.pop_front(); - } // End of critical U - if (isEmpty) break; + { + if (U.empty()) isEmpty = true; + else u = U.pop_front(); + } // End of critical U + if (isEmpty) break; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices ++; #endif - //Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; //Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) { - v = verLocInd[k]; + //Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; //Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: #pragma omp critical { -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex)) { //If Local Vertex: - //If the current vertex is pointing to a matched vertex and is not matched - //FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. 
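        // (One likely reason the FIXME above is hard to resolve: candidateMate[v - StartIndex]
        //  can be read and rewritten by whichever thread happens to dequeue one of v's matched
        //  neighbours from U, so the array is genuinely shared state; a thread-private copy
        //  would need its own merge step.)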
- if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) { - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - if ((w < StartIndex) || (w > EndIndex)) { //A ghost -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - //Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; + + //If the current vertex is pointing to a matched vertex and is not matched + //FIXME is there a way to make candidateMate private? + // for the moment it could generate errors. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and + candidateMate[v - StartIndex] == u) { + + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + + //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; + //If found a dominating edge: + + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else { //Neighbor is a ghost vertex + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + GMate[Ghost2LocalMap[w]] = v; //w is a ghost vertex + //Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 -#pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + + } //End of if a Ghost Vertex + else { //w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + Mate[w - StartIndex] = v; //w is a local vertex + 
privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of Else + + } //End of if(w >=0) + else { + + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor is a ghost vertex + + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + + if (v != Mate[u - StartIndex]) { //u is local + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex #pragma omp critical(U) - { - while(!privateU.empty()) { - U.push_back(privateU.pop_front()); - } + { + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + } - myCard += privateMyCard; - } //End of critical U + myCard += privateMyCard; + } //End of critical U - } //End of while ( /*!Q.empty()*/ !U.empty() ) + } //End of while #pragma omp critical(privateMsg) { diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index bdacc992..1b59d29b 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0123 ! IDIM; domain size. Linear system size is IDIM**3 +00080 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From b2230a6d6d87e7098c59655f435ef2a6c4319751 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 13 Jun 2022 16:09:00 -0500 Subject: [PATCH 26/96] Improved critical region U --- ...istEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index dee2a019..966b86a2 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -841,9 +841,13 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of If( v != Mate[u] ) } //End of critical region } //End of Else //A Ghost Vertex + } //End of For Loop adj(u) + } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + //Avoid to ask for the critical section if there is nothing to add + if(privateU.empty()) continue; #pragma omp critical(U) { while(!privateU.empty()) { @@ -852,7 +856,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( myCard += privateMyCard; } //End of critical U - } //End of while ( /*!Q.empty()*/ !U.empty() ) #pragma omp critical(privateMsg) From bf35c1659b4f8aa7b9ebd635a6d39fe19f37654f Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 13 Jun 2022 16:53:12 -0500 Subject: [PATCH 27/96] Further improved critical region U --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 38 +++++++++++++++---- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 966b86a2..82ca4c44 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -72,6 +72,8 @@ Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) */ +#define UCHUNK 1000 + #ifdef SERIAL_MPI #else //MPI type map @@ -658,23 +660,41 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt localVertices = 0; #endif - while( true ) - { + //TODO what would be the optimal UCHUNK + vector Us; + Us.reserve(UCHUNK); + + while( true ) { + Us.clear(); #pragma omp critical(U) { - if (U.empty()) isEmpty = true; - else u = U.pop_front(); + //If U is emptu and there are no new node to add to U + if (U.empty() && privateU.empty()) + isEmpty = true; + else { + if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + myCard += privateMyCard; + } + for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes + if (U.empty()) break; + Us.push_back(U.pop_front()); + } + } } // End of critical U if (isEmpty) break; + for (MilanLongInt u : Us) + { #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices ++; #endif //Get the Adjacency list for u @@ -847,15 +867,17 @@ void 
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex //Avoid to ask for the critical section if there is nothing to add - if(privateU.empty()) continue; + if (privateU.size() < UCHUNK && !U.empty()) continue; #pragma omp critical(U) { - while(!privateU.empty()) { + while (!privateU.empty()) { U.push_back(privateU.pop_front()); } myCard += privateMyCard; } //End of critical U + + } } //End of while ( /*!Q.empty()*/ !U.empty() ) #pragma omp critical(privateMsg) diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index bdacc992..eb254780 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0123 ! IDIM; domain size. Linear system size is IDIM**3 +0080 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC From 6fd571ecb2d30a037c668019553e707664e8270e Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 14 Jun 2022 14:33:31 -0500 Subject: [PATCH 28/96] Lock error --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 48 +++++++++++++++---- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 82ca4c44..c9568a9f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -225,6 +225,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanInt BufferSize; MilanLongInt *Buffer; bool isEmpty; + + //Declare the locks + omp_lock_t MateLock[NLVer]; #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif @@ -232,8 +235,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) { - // TODO comments about the reduction + //Initialize the locks + //TODO this can be executed as task in parallel with other unparallelizable tasks + //TODO destroy the locks +#pragma omp for schedule(static) + for(int i = 0; i < NLVer; i++) + omp_init_lock(&MateLock[i]); + // TODO comments about the reduction #pragma omp for reduction(+ : numGhostEdges) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; @@ -704,7 +713,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: -#pragma omp critical +#pragma omp critical(innerProcessMatched) { #ifdef PRINT_DEBUG_INFO_ @@ -712,11 +721,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif + //If the current vertex is pointing to a matched vertex and is not matched //FIXME is there a way to make candidateMate private? // for the moment it could generate an error. 
if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and candidateMate[v - StartIndex] == u) { + + //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], @@ -737,6 +749,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif //If found a dominating edge: if (w >= 0) { + + //TODO is it possible to lock without a critical region? + //TODO there must be a more elegant and efficient way to do this + while(true) { + if (omp_test_lock(&MateLock[v - StartIndex])) { + if (omp_test_lock(&MateLock[w - StartIndex])) break; + else omp_unset_lock(&MateLock[v - StartIndex]); + } + } + + if ((w < StartIndex) || (w > EndIndex)) { //A ghost #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<=0) else { adj11 = verLocPtr[v - StartIndex]; @@ -798,11 +825,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( for (k1 = adj11; k1 < adj12; k1++) { w = verLocInd[k1]; if ((w < StartIndex) || (w > EndIndex)) { //A ghost - //Build the Message Packet: - //Message[0] = v; //LOCAL - //Message[1] = w; //GHOST - //Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) + #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { //Neighbor is a ghost vertex -#pragma omp critical +#pragma omp critical(innerProcessMatched) { + + while(!omp_test_lock(&MateLock[u - StartIndex])); + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; if (v != Mate[u - StartIndex]) { //u is local @@ -859,6 +886,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( NumMessagesBundled++; msgInd++; } //End of If( v != Mate[u] ) + + omp_unset_lock(&MateLock[u - StartIndex]); + } //End of critical region } //End of Else //A Ghost Vertex diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..904b6551 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +020 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
ISTOPC From 2044c5c8eb6a0c307a8d316140aec951954394b3 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 14 Jun 2022 14:47:45 -0500 Subject: [PATCH 29/96] Merge fix, lock error --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 610 +++++++++--------- 1 file changed, 298 insertions(+), 312 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 6e58d724..f747f1fc 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,7 +1,6 @@ #include "MatchBoxPC.h" #include #include - // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -314,17 +313,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif - /* - * Not parallelizable - */ + /* + * Not parallelizable + */ - for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) - verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; + for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) + verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ - cout< 0) { + if (Counter[Ghost2LocalMap[w]] > 0) { Counter[Ghost2LocalMap[w]] -= 1; //Decrement if (Counter[Ghost2LocalMap[w]] == 0) { @@ -614,6 +613,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if(w >=0) + //This piece of code is executed a really small amount of times, I will not allocate a + //huge amount of memory to the private data structures. adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; for (k1 = adj11; k1 < adj12; k1++) { @@ -632,28 +633,16 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( assert(ghostOwner != -1); assert(ghostOwner != myRank); PCounter[ghostOwner]++; - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + QOwner.push_back(ghostOwner); } //End of if(GHOST) } //End of for loop //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) { - - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - - } - } - #pragma omp critical(U) { while (!privateU.empty()) @@ -668,261 +657,258 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Us; - Us.reserve(UCHUNK); + //TODO what would be the optimal UCHUNK + vector Us; + Us.reserve(UCHUNK); - while( true ) { + while( true ) { - Us.clear(); + Us.clear(); #pragma omp critical(U) - { - //If U is emptu and there are no new node to add to U - if (U.empty() && privateU.empty()) - isEmpty = true; - else { - if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U - while (!privateU.empty()) { - U.push_back(privateU.pop_front()); - myCard += privateMyCard; + { + //If U is emptu and there are no new node to add to U + if (U.empty() && privateU.empty()) + isEmpty = true; + else { + if (U.empty() && !privateU.empty()) // If U is empty but there are 
nodes in private U + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + myCard += privateMyCard; + } + for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes + if (U.empty()) break; + Us.push_back(U.pop_front()); } - for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes - if (U.empty()) break; - Us.push_back(U.pop_front()); } - } - } // End of critical U - if (isEmpty) break; + } // End of critical U + if (isEmpty) break; - for (MilanLongInt u : Us) - { + for (MilanLongInt u : Us) + { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices ++; #endif - //Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; //Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) { - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: -#pragma omp critical(innerProcessMatched) - { + //Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; //Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: +#pragma omp critical(innerProcessMatched) + { - - //If the current vertex is pointing to a matched vertex and is not matched - //FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) { - - - //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - //Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - - //TODO is it possible to lock without a critical region? - //TODO there must be a more elegant and efficient way to do this - while(true) { - if (omp_test_lock(&MateLock[v - StartIndex])) { - if (omp_test_lock(&MateLock[w - StartIndex])) break; - else omp_unset_lock(&MateLock[v - StartIndex]); - } - } - if ((w < StartIndex) || (w > EndIndex)) { //A ghost -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { - omp_unset_lock(&MateLock[v - StartIndex]); - omp_unset_lock(&MateLock[w - StartIndex]); + //TODO is it possible to lock without a critical region? 
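            // A common alternative to the test-and-back-off loop below (not what this code
            // does) is to acquire the two locks in a fixed global order, e.g. lowest local
            // index first; with a total order on acquisition no thread can hold one lock
            // while spinning on the other, so plain omp_set_lock calls suffice. Sketch only,
            // assuming v and w both map to valid local lock slots (not true when w is a ghost):
            //
            //     MilanLongInt lo = std::min(v, w) - StartIndex;
            //     MilanLongInt hi = std::max(v, w) - StartIndex;
            //     omp_set_lock(&MateLock[lo]);
            //     if (hi != lo) omp_set_lock(&MateLock[hi]);
            //     /* ... update Mate / GMate for v and w ... */
            //     if (hi != lo) omp_unset_lock(&MateLock[hi]);
            //     omp_unset_lock(&MateLock[lo]);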
+ //TODO there must be a more elegant and efficient way to do this + while(true) { + if (omp_test_lock(&MateLock[v - StartIndex])) { + if (omp_test_lock(&MateLock[w - StartIndex])) break; + else omp_unset_lock(&MateLock[v - StartIndex]); + } + } - } //End of if(w >=0) - else { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) { //A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement + if (Counter[Ghost2LocalMap[w]] == 0) { + S--; //Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } //End of if CandidateMate[w] = v + } //End of if a Ghost Vertex + else { //w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; //v is a local vertex + Mate[w - StartIndex] = v; //w is a local vertex + //Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("<=0) + else { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { //A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< 0 + /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), + ComputeTag, comm); */ + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + //ghostOwner = inputSubGraph.findOwner(w); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + } //End of if(GHOST) + } //End of for loop + } // End of Else: w == -1 + //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + + } //End of If (candidateMate[v-StartIndex] == u + + } //End of critical region if + + } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { //Neighbor is a ghost vertex #pragma omp critical(innerProcessMatched) - { + { - while(!omp_test_lock(&MateLock[u - StartIndex])); + while(!omp_test_lock(&MateLock[u - StartIndex])); - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) { //u is local + //Build the Message Packet: + //Message[0] = u; //LOCAL + //Message[1] = v; //GHOST + //Message[2] = SUCCESS; //TYPE + //Send a Request (Asynchronous) - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } //End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - //Avoid to ask for the 
critical section if there is nothing to add - if (privateU.size() < UCHUNK && !U.empty()) continue; + //Avoid to ask for the critical section if there is nothing to add + if (privateU.size() < UCHUNK && !U.empty()) continue; #pragma omp critical(U) - { - while (!privateU.empty()) { - U.push_back(privateU.pop_front()); - } + { + while (!privateU.empty()) { + U.push_back(privateU.pop_front()); + } - myCard += privateMyCard; - } //End of critical U + myCard += privateMyCard; + } //End of critical U - } - } //End of while ( /*!Q.empty()*/ !U.empty() ) + } + } //End of while ( /*!Q.empty()*/ !U.empty() ) #pragma omp critical(privateMsg) { @@ -938,20 +924,20 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", + printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, omp_get_thread_num(), myRank); #endif - ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< 0 ) { - Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory - if ( Buffer == 0 ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout<<"Not enough memory to allocate for send buffer on process "< 0 ) { + Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory + if ( Buffer == 0 ) { + cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout<<"Not enough memory to allocate for send buffer on process "< &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap - ) { + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap +) { bool result = false; #pragma omp critical(Mate) @@ -1776,15 +1762,15 @@ inline bool isAlreadyMatched(MilanLongInt node, * @return */ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal* edgeLocWeight, - MilanLongInt k, - MilanLongInt* verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector & GMate, - MilanLongInt* Mate, - map & Ghost2LocalMap) + MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector & GMate, + MilanLongInt* Mate, + map & Ghost2LocalMap) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN From bf0532867d4b5d7e891a45609d496d213fb3ce84 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 25 Jun 2022 08:48:49 -0500 Subject: [PATCH 30/96] Functions in different files --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 168 ++---------------- .../impl/aggregator/computeCandidateMate.cpp | 69 +++++++ amgprec/impl/aggregator/findOwnerOfGhost.cpp | 52 ++++++ amgprec/impl/aggregator/isAlreadyMatched.cpp | 42 +++++ samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 5 files changed, 177 insertions(+), 156 deletions(-) create mode 100644 amgprec/impl/aggregator/computeCandidateMate.cpp create mode 100644 amgprec/impl/aggregator/findOwnerOfGhost.cpp create mode 100644 amgprec/impl/aggregator/isAlreadyMatched.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index f747f1fc..ea0e460f 100644 --- 
a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,6 +1,10 @@ #include "MatchBoxPC.h" #include #include +#include "isAlreadyMatched.cpp" +#include "findOwnerOfGhost.cpp" +#include "computeCandidateMate.cpp" + // *********************************************************************** // // MatchboxP: A C++ library for approximate weighted matching @@ -238,9 +242,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Initialize the locks //TODO this can be executed as task in parallel with other unparallelizable tasks //TODO destroy the locks -#pragma omp for schedule(static) - for(int i = 0; i < NLVer; i++) - omp_init_lock(&MateLock[i]); +//#pragma omp for schedule(static) +// for(int i = 0; i < NLVer; i++) +// omp_init_lock(&MateLock[i]); // TODO comments about the reduction #pragma omp for reduction(+ : numGhostEdges) @@ -752,12 +756,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //TODO is it possible to lock without a critical region? //TODO there must be a more elegant and efficient way to do this + /* while(true) { if (omp_test_lock(&MateLock[v - StartIndex])) { if (omp_test_lock(&MateLock[w - StartIndex])) break; else omp_unset_lock(&MateLock[v - StartIndex]); } } + */ if ((w < StartIndex) || (w > EndIndex)) { //A ghost @@ -815,8 +821,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } //End of if(CandidateMate(w) = v } //End of Else - omp_unset_lock(&MateLock[v - StartIndex]); - omp_unset_lock(&MateLock[w - StartIndex]); + //omp_unset_lock(&MateLock[v - StartIndex]); + //omp_unset_lock(&MateLock[w - StartIndex]); } //End of if(w >=0) else { @@ -859,7 +865,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #pragma omp critical(innerProcessMatched) { - while(!omp_test_lock(&MateLock[u - StartIndex])); + //while(!omp_test_lock(&MateLock[u - StartIndex])); if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; @@ -887,7 +893,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( msgInd++; } //End of If( v != Mate[u] ) - omp_unset_lock(&MateLock[u - StartIndex]); + //omp_unset_lock(&MateLock[u - StartIndex]); } //End of critical region } //End of Else //A Ghost Vertex @@ -1637,154 +1643,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //MPI_Barrier(comm); } //End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate - -///Find the owner of a ghost node: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs) { - //MilanLongInt Size = mVerDistance.size(); - MilanLongInt mStartInd = mVerDistance[myRank]; - MilanInt Start = 0; - MilanInt End = numProcs; - MilanInt Current = 0; - -#if 0 - if ( vtxIndex < mStartInd ) - End = myRank; - else - Start = myRank; -#endif - - while ( Start <= End ) { - Current = (End + Start)/2; - //CASE-1: - if ( mVerDistance[Current] == vtxIndex ) { - while ( mVerDistance[Current+1] == vtxIndex ) { - Current++; - if ( Current == numProcs ) - return (-1); - } - return (Current); - } - else { //CASE 2: - if ( mVerDistance[Current] > vtxIndex ) - End = Current - 1; - else //CASE 3: - Start = Current + 1; - } - } //End of While() - if ( Current == 0 ) - return (Current); - else { - if ( mVerDistance[Current] > vtxIndex ) - return (Current-1); - else - return (Current); - } //End of else - return (-1); 
//It should not reach here! -} //End of findOwnerOfGhost() - -/** - * Execute the research fr the Candidate Mate without controlling if the vertices are already matched. - * Returns the vertices with the highest weight - * @param adj1 - * @param adj2 - * @param verLocInd - * @param edgeLocWeight - * @return - */ -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt* verLocInd, - MilanReal* edgeLocWeight) -{ - MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - int finalK; - for (int k = adj1; k < adj2; k++) { - - if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - finalK = k; - } - } //End of for loop - return finalK; -} - -/** - * //TODO documentation - * @param k - * @param verLocInd - * @param StartIndex - * @param EndIndex - * @param GMate - * @param Mate - * @param Ghost2LocalMap - * @return - */ -inline bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap -) { - - bool result = false; -#pragma omp critical(Mate) - { - if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched - result = true; - } else { //A local vertex - if (Mate[node - StartIndex] >= 0) // Already matched - result = true; - } - - } - - return result; -} - -/** - * //TODO documentation - * @param adj1 - * @param adj2 - * @param edgeLocWeight - * @param k - * @param verLocInd - * @param StartIndex - * @param EndIndex - * @param GMate - * @param Mate - * @param Ghost2LocalMap - * @return - */ -inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal* edgeLocWeight, - MilanLongInt k, - MilanLongInt* verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector & GMate, - MilanLongInt* Mate, - map & Ghost2LocalMap) -{ - MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - for (k = adj1; k < adj2; k++) { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; - - if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { - heaviestEdgeWt = edgeLocWeight[k]; - w = verLocInd[k]; - } - } //End of for loop - return w; -} #endif #endif \ No newline at end of file diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp new file mode 100644 index 00000000..92e3c92b --- /dev/null +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -0,0 +1,69 @@ +#include "MatchBoxPC.h" + +/** + * Execute the research fr the Candidate Mate without controlling if the vertices are already matched. 
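 * (Note: the value actually returned is finalK, the position of the heaviest admissible
 *  edge in verLocInd/edgeLocWeight, rather than the vertex id itself.)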
+ * Returns the vertices with the highest weight + * @param adj1 + * @param adj2 + * @param verLocInd + * @param edgeLocWeight + * @return + */ +inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt* verLocInd, + MilanReal* edgeLocWeight) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + int finalK; + for (int k = adj1; k < adj2; k++) { + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + finalK = k; + } + } //End of for loop + return finalK; +} + +/** + * //TODO documentation + * @param adj1 + * @param adj2 + * @param edgeLocWeight + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal* edgeLocWeight, + MilanLongInt k, + MilanLongInt* verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector & GMate, + MilanLongInt* Mate, + map & Ghost2LocalMap) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } //End of for loop + return w; +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp new file mode 100644 index 00000000..10850607 --- /dev/null +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -0,0 +1,52 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" + +///Find the owner of a ghost node: +inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + MilanInt myRank, MilanInt numProcs) { + //MilanLongInt Size = mVerDistance.size(); + MilanLongInt mStartInd = mVerDistance[myRank]; + MilanInt Start = 0; + MilanInt End = numProcs; + MilanInt Current = 0; + +#if 0 + if ( vtxIndex < mStartInd ) + End = myRank; + else + Start = myRank; +#endif + + while ( Start <= End ) { + Current = (End + Start)/2; + //CASE-1: + if ( mVerDistance[Current] == vtxIndex ) { + while ( mVerDistance[Current+1] == vtxIndex ) { + Current++; + if ( Current == numProcs ) + return (-1); + } + return (Current); + } + else { //CASE 2: + if ( mVerDistance[Current] > vtxIndex ) + End = Current - 1; + else //CASE 3: + Start = Current + 1; + } + } //End of While() + if ( Current == 0 ) + return (Current); + else { + if ( mVerDistance[Current] > vtxIndex ) + return (Current-1); + else + return (Current); + } //End of else + return (-1); //It should not reach here! 
+} //End of findOwnerOfGhost() diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp new file mode 100644 index 00000000..5a9cf476 --- /dev/null +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -0,0 +1,42 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" + +/** + * //TODO documentation + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +inline bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt* Mate, + map &Ghost2LocalMap +) { + + bool result = false; +#pragma omp critical(Mate) + { + if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? + if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched + result = true; + } else { //A local vertex + if (Mate[node - StartIndex] >= 0) // Already matched + result = true; + } + + } + + return result; +} \ No newline at end of file diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index 904b6551..b6c448c3 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -020 ! IDIM; domain size. Linear system size is IDIM**3 +0020 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC From a54f084ffb57b51735e2fb2412ab74bef7d87ac2 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 25 Jun 2022 10:16:30 -0500 Subject: [PATCH 31/96] refactoring, initialization --- amgprec/impl/aggregator/MatchBoxPC.h | 25 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 236 +++-------------- amgprec/impl/aggregator/initialize.cpp | 239 ++++++++++++++++++ 3 files changed, 305 insertions(+), 195 deletions(-) create mode 100644 amgprec/impl/aggregator/initialize.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index ba7cb5c8..9b0218bc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -59,7 +59,7 @@ #include #include #include -// #include "matchboxp.h" +#include "omp.h" #include "primitiveDataTypeDefinitions.h" #include "dataStrStaticQueue.h" @@ -175,6 +175,29 @@ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt* Mate, map &Ghost2LocalMap); +inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt* numGhostEdgesPtr, + MilanLongInt* numGhostVerticesPtr, + MilanLongInt* insertMePtr, + MilanLongInt* verLocInd, + MilanLongInt* verLocPtr, + omp_lock_t* MateLock, + map &Ghost2LocalMap, + vector & Counter, + vector & verGhostPtr, + vector & verGhostInd, + vector & tempCounter, + vector & GMate, + vector& Message, + vector& QLocalVtx, + vector& QGhostVtx, + vector& QMsgType, + vector& QOwner, + MilanLongInt* candidateMate, + staticQueue& U + ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP ( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp 
b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ea0e460f..dc3606c3 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -4,6 +4,7 @@ #include "isAlreadyMatched.cpp" #include "findOwnerOfGhost.cpp" #include "computeCandidateMate.cpp" +#include "initialize.cpp" // *********************************************************************** // @@ -146,10 +147,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanReal startTime, finishTime; //MilanReal Precision = MPI_Wtick(); //Get the precision of the MPI Timer startTime = MPI_Wtime(); - //Get the iterators for the graph: - //vector::iterator verLocPtr = inputSubGraph.getVerPtr_b(); - //vector::iterator verLocInd = inputSubGraph.getVerInd_b(); - //vector::iterator edgeLocWeight = inputSubGraph.getEdgeWt_b(); //Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] @@ -171,9 +168,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( PCounter[i] = 0; - MilanLongInt NumMessagesBundled; - MilanInt ghostOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - //vector candidateMate; + MilanLongInt NumMessagesBundled = 0; + MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! MilanLongInt* candidateMate = new MilanLongInt[1]; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex - // index that starts with zero to |Vg| - 1 - map::iterator storedAlready; vector Counter; //Store the edge count for each ghost vertex MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices @@ -200,17 +194,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( if (myRank == 0) cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; @@ -232,182 +215,37 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Declare the locks omp_lock_t MateLock[NLVer]; -#ifdef TIME_TRACKER - double Ghost2LocalInitialization = MPI_Wtime(); -#endif - -#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) - { - - //Initialize the locks - //TODO this can be executed as task in parallel with other unparallelizable tasks - //TODO destroy the locks -//#pragma omp for schedule(static) -// for(int i = 0; i < NLVer; i++) -// omp_init_lock(&MateLock[i]); - - // TODO comments about the reduction -#pragma omp for reduction(+ : numGhostEdges) - for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice - insertMe = verLocInd[i]; - //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost - numGhostEdges++; -#pragma omp critical - { - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "<second<<" - "<first<<" : "<second]< 0 ) - cout< EndIndex)) { //Find a ghost -#pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - 
tempCounter[Ghost2LocalMap[w]]++; //Increment the counter - } - } //End of if((w < StartIndex) || (w > EndIndex)) - } //End of for(k) - } //End of for (v) +#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { #pragma omp single - { - -#ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); -#endif - -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt* numGhostEdgesPtr, + MilanLongInt* numGhostVerticesPtr, + MilanLongInt* insertMePtr, + MilanLongInt* verLocInd, + MilanLongInt* verLocPtr, + omp_lock_t* MateLock, + map &Ghost2LocalMap, + vector & Counter, + vector & verGhostPtr, + vector & verGhostInd, + vector & tempCounter, + vector & GMate, + vector& Message, + vector& QLocalVtx, + vector& QGhostVtx, + vector& QMsgType, + vector& QOwner, + MilanLongInt* candidateMate, + staticQueue& U + ) +{ + + MilanLongInt insertMe = 0, numGhostEdges = 0, numGhostVertices = 0; + MilanLongInt adj1, adj2; + int i, v, k, w; + + + // index that starts with zero to |Vg| - 1 + map::iterator storedAlready; + +#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) + { + + //Initialize the locks + //TODO this can be executed as task in parallel with other unparallelizable tasks + //TODO destroy the locks +#pragma omp for schedule(static) + for(i = 0; i < NLVer; i++) + omp_init_lock(&MateLock[i]); + + +#ifdef TIME_TRACKER + double Ghost2LocalInitialization = MPI_Wtime(); +#endif + + /* + * OMP Ghost2LocalInitialization + * The cycle analyzes all the edges and when finds a ghost edge + * puts it in the Ghost2LocalMap. + * A critical region is needed when inserting data in the map. + * + * Despite the critical region it is still productive to + * parallelize this for because the critical region is exeuted + * only when a ghost edge is found and ghost edges are a minority. 
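 *
 * The reduction(+ : numGhostEdges) clause on the loop below gives each thread its own
 * zero-initialised private copy of numGhostEdges and adds the copies back into the
 * shared variable when the loop finishes, so the ghost-edge count itself needs no
 * atomic update inside the loop.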
+ */ + + // TODO comments about the reduction +#pragma omp for reduction(+ : numGhostEdges) + for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice + insertMe = verLocInd[i]; + //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost + numGhostEdges++; +#pragma omp critical + { + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) { //Has already been added + //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter + } else { //Insert an entry for the ghost: + //cout<<"Process "<second<<" - "<first<<" : "<second]< 0 ) + cout< EndIndex)) { //Find a ghost +#pragma omp critical + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency + tempCounter[Ghost2LocalMap[w]]++; //Increment the counter + } + } //End of if((w < StartIndex) || (w > EndIndex)) + } //End of for(k) + } //End of for (v) + + } + + #pragma omp single + { + +#ifdef TIME_TRACKER + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout<<"\n("< Date: Sat, 25 Jun 2022 12:10:14 -0500 Subject: [PATCH 32/96] Refactoring Initialization --- amgprec/impl/aggregator/MatchBoxPC.h | 3 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 66 ++----------------- amgprec/impl/aggregator/initialize.cpp | 44 ++++++++++++- 3 files changed, 49 insertions(+), 64 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 9b0218bc..f1cb257a 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -180,6 +180,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt* numGhostEdgesPtr, MilanLongInt* numGhostVerticesPtr, MilanLongInt* insertMePtr, + MilanLongInt* S, MilanLongInt* verLocInd, MilanLongInt* verLocPtr, omp_lock_t* MateLock, @@ -194,7 +195,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, vector& QGhostVtx, vector& QMsgType, vector& QOwner, - MilanLongInt* candidateMate, + MilanLongInt* &candidateMate, staticQueue& U ); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index dc3606c3..174eb578 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -170,7 +170,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NumMessagesBundled = 0; MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! 
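   // (In this revision candidateMate is only declared here; the MatchBoxPC.h change above
   //  passes it to initialize() as MilanLongInt* &candidateMate, presumably so the array can
   //  be allocated inside initialize() once the number of ghost vertices is known.)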
- MilanLongInt* candidateMate = new MilanLongInt[1]; + MilanLongInt* candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<& QGhostVtx, vector& QMsgType, vector& QOwner, - MilanLongInt* candidateMate, + MilanLongInt* &candidateMate, staticQueue& U ) { @@ -233,7 +234,48 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } // end of single region +#ifdef PRINT_DEBUG_INFO_ +cout<<"\n("< Date: Sat, 25 Jun 2022 15:25:13 -0500 Subject: [PATCH 33/96] Refactoring private queues, still not working --- amgprec/impl/aggregator/MatchBoxPC.h | 8 ++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 33 +++++------ amgprec/impl/aggregator/dataStrStaticQueue.h | 6 +- amgprec/impl/aggregator/initialize.cpp | 57 ++++++++----------- 4 files changed, 48 insertions(+), 56 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index f1cb257a..54830919 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -179,7 +179,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt* numGhostEdgesPtr, MilanLongInt* numGhostVerticesPtr, - MilanLongInt* insertMePtr, MilanLongInt* S, MilanLongInt* verLocInd, MilanLongInt* verLocPtr, @@ -196,7 +195,12 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, vector& QMsgType, vector& QOwner, MilanLongInt* &candidateMate, - staticQueue& U + staticQueue& U, + staticQueue& privateU, + staticQueue& privateQLocalVtx, + staticQueue& privateQGhostVtx, + staticQueue& privateQMsgType, + staticQueue& privateQOwner ); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 174eb578..b4ead45d 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -185,7 +185,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //Build the Ghost Vertex Set: Vg map Ghost2LocalMap; //Map each ghost vertex to a local vertex vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices + MilanLongInt numGhostVertices = 0, numGhostEdges = 0; //Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ cout<<"\n("<& QMsgType, vector& QOwner, MilanLongInt* &candidateMate, - staticQueue& U + staticQueue& U, + staticQueue& privateU, + staticQueue& privateQLocalVtx, + staticQueue& privateQGhostVtx, + staticQueue& privateQMsgType, + staticQueue& privateQOwner ) { @@ -37,7 +41,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt adj1, adj2; int i, v, k, w; - // index that starts with zero to |Vg| - 1 map::iterator storedAlready; @@ -64,10 +67,9 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * * Despite the critical region it is still productive to * parallelize this for because the critical region is exeuted - * only when a ghost edge is found and ghost edges are a minority. + * only when a ghost edge is found and ghost edges are a minority, + * circa 3.5% during the tests. 
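The change from MilanLongInt* candidateMate to MilanLongInt* &candidateMate in the initialize() signature is what lets the allocation move inside initialize() while the caller still sees the new buffer. A tiny sketch of that out-parameter idiom, with hypothetical names (allocateBuffer is not a MatchBox function):

#include <cstdio>

// Out-parameter sketch: the reference-to-pointer lets the callee's new[]
// become visible to the caller.
void allocateBuffer(long n, long *&buf) {
    buf = new long[n];                  // the caller's pointer now owns this block
    for (long i = 0; i < n; i++) buf[i] = -1;
}

int main() {
    long *candidate = nullptr;
    allocateBuffer(8, candidate);       // with a plain 'long *buf' parameter,
                                        // candidate would still be nullptr here
    std::printf("candidate[0] = %ld\n", candidate[0]);
    delete[] candidate;
    return 0;
}

With a pointer passed by value the callee would only overwrite its local copy, so moving the new[] into initialize() requires either this reference form or an extra level of indirection.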
*/ - - // TODO comments about the reduction #pragma omp for reduction(+ : numGhostEdges) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; @@ -90,8 +92,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } //End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) } //End of for(ghost vertices) - - #pragma omp single { //numGhostEdges = atomicNumGhostEdges; @@ -143,7 +143,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, /* * Not parallelizable */ - for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ @@ -163,6 +162,10 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, fflush(stdout); #endif +#ifdef TIME_TRACKER + double verGhostIndInitialization = MPI_Wtime(); +#endif + /* * OMP verGhostIndInitialization * @@ -175,13 +178,8 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * Despite the critical region it's still useful to * parallelize the for cause the ghost nodes * are a minority hence the critical region is executed - * few times. + * few times, circa 3.5% of the times in the tests. */ - -#ifdef TIME_TRACKER - double verGhostIndInitialization = MPI_Wtime(); -#endif - #pragma omp for nowait schedule(static) for (v = 0; v < NLVer; v++) { adj1 = verLocPtr[v]; //Vertex Pointer @@ -192,17 +190,14 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp critical { insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency tempCounter[Ghost2LocalMap[w]]++; //Increment the counter } + verGhostInd[insertMe] = v + StartIndex; //Add the adjacency } //End of if((w < StartIndex) || (w > EndIndex)) } //End of for(k) } //End of for (v) - - } - #pragma omp single - { + } // End of parallel region #ifdef TIME_TRACKER verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; @@ -216,11 +211,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, cout< Date: Sun, 26 Jun 2022 04:40:13 -0500 Subject: [PATCH 34/96] Initialize parallelized with task --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 30 ++++++++++---- amgprec/impl/aggregator/initialize.cpp | 41 ++++++++++++------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index b4ead45d..9d5b6417 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -326,9 +326,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( assert(ghostOwner != myRank); PCounter[ghostOwner]++; - //TODO why does it fail if I use a private data structure??? - /* + //TODO why does it fail if I use a private data structure??? 
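The ghostOwner computed a few lines above comes from findOwnerOfGhost(), which the header describes as a binary search over verDistance. A minimal stand-in with the same contract, assuming the usual distribution array of numProcs+1 entries where rank p owns the half-open range [dist[p], dist[p+1]) (ownerOfVertex is a hypothetical name, not the MatchBox routine):

#include <cstdio>

// Hypothetical stand-in for findOwnerOfGhost: dist has numProcs+1 entries and
// rank p owns vertices in [dist[p], dist[p+1]).
int ownerOfVertex(long vtx, const long *dist, int numProcs) {
    int lo = 0, hi = numProcs - 1;
    while (lo < hi) {
        int mid = (lo + hi) / 2;
        if (vtx < dist[mid + 1]) hi = mid;   // vtx falls at or before rank mid
        else                     lo = mid + 1;
    }
    return lo;
}

int main() {
    long dist[] = {0, 100, 250, 400};        // three ranks
    std::printf("%d %d %d\n",
                ownerOfVertex(5, dist, 3),    // 0
                ownerOfVertex(100, dist, 3),  // 1
                ownerOfVertex(399, dist, 3)); // 2
    return 0;
}

The asserts in the patch then simply state that a vertex classified as ghost cannot resolve to the calling rank itself.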
privateQLocalVtx.push_back(v + StartIndex); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); @@ -343,7 +342,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QMsgType.push_back(REQUEST); QOwner.push_back(ghostOwner); } // end of critical region - + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { @@ -430,6 +429,20 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } //End of for ( v=0; v < NLVer; v++ ) + + #pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) { + + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); + + } + + } + #pragma omp critical(U) { while (!privateU.empty()) @@ -699,16 +712,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } } //End of while ( /*!Q.empty()*/ !U.empty() ) -#pragma omp critical(privateMsg) + #pragma omp critical(privateMsg) { while (!privateQLocalVtx.empty()) { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); } + } diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 117057b5..908bd1d3 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -8,6 +8,8 @@ #include "dataStrStaticQueue.h" #include "omp.h" +#define NUM_THREAD 4 + inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt* numGhostEdgesPtr, @@ -44,17 +46,19 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, // index that starts with zero to |Vg| - 1 map::iterator storedAlready; -#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4) +#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD) { + #pragma omp single + { + //Initialize the locks //TODO this can be executed as task in parallel with other unparallelizable tasks //TODO destroy the locks -#pragma omp for schedule(static) +#pragma omp taskloop num_tasks(NUM_THREAD) for(i = 0; i < NLVer; i++) omp_init_lock(&MateLock[i]); - - + #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); #endif @@ -70,7 +74,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * only when a ghost edge is found and ghost edges are a minority, * circa 3.5% during the tests. 
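The privateQ* queues above implement a common OpenMP pattern: each thread buffers its work locally and empties the buffer into the shared queues inside one named critical section, in FIFO order (pop_front), so entries keep the order in which they were produced. A reduced sketch with std::vector standing in for staticQueue and simplified names:

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    std::vector<long> QLocalVtx;              // shared queue
    const long nWork = 10000;

#pragma omp parallel
    {
        std::vector<long> privateQ;           // per-thread buffer
#pragma omp for nowait
        for (long v = 0; v < nWork; v++)
            if (v % 7 == 0)                   // "found something to queue"
                privateQ.push_back(v);

#pragma omp critical(privateMsg)              // one flush per thread, order preserved
        QLocalVtx.insert(QLocalVtx.end(), privateQ.begin(), privateQ.end());
    }
    std::printf("queued %zu items\n", QLocalVtx.size());
    return 0;
}

Flushing once per thread (or once every UCHUNK items, as the main loop does for privateU) keeps the critical section rare, which is the same argument the patches make for the ghost-edge loops.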
*/ -#pragma omp for reduction(+ : numGhostEdges) +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ : numGhostEdges) depend ( out : numGhostEdges, Counter, Ghost2LocalMap ) for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice insertMe = verLocInd[i]; //cout<<"InsertMe on Process "< EndIndex) ) } //End of for(ghost vertices) - #pragma omp single - { - //numGhostEdges = atomicNumGhostEdges; #ifdef TIME_TRACKER Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); @@ -114,6 +115,9 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } #endif + #pragma omp task depend ( out : verGhostPtr, tempCounter, verGhostInd, GMate) depend ( in : numGhostVertices) + { + //Initialize adjacency Lists for Ghost Vertices: try { verGhostPtr.reserve(numGhostVertices + 1); //Pointer Vector @@ -139,18 +143,17 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, double verGhostPtrInitialization = MPI_Wtime(); #endif + } // End of task - /* - * Not parallelizable - */ +#pragma omp task depent ( out : verGhostPtr ) depend ( in : Counter, numGhostVertices) + { for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ cout< Date: Sun, 26 Jun 2022 04:48:49 -0500 Subject: [PATCH 35/96] Reformat initialize, refactoring of initialize completed --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 + amgprec/impl/aggregator/initialize.cpp | 435 ++++++++++-------- 2 files changed, 236 insertions(+), 200 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 9d5b6417..d6c58852 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -214,6 +214,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( bool isEmpty; //Declare the locks + // TODO destroy the locks omp_lock_t MateLock[NLVer]; initialize(NLVer, NLEdge, StartIndex, diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 908bd1d3..21210c34 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -8,35 +8,34 @@ #include "dataStrStaticQueue.h" #include "omp.h" -#define NUM_THREAD 4 +#define NUM_THREAD 12 inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt* numGhostEdgesPtr, - MilanLongInt* numGhostVerticesPtr, - MilanLongInt* S, - MilanLongInt* verLocInd, - MilanLongInt* verLocPtr, - omp_lock_t* MateLock, - map &Ghost2LocalMap, - vector & Counter, - vector & verGhostPtr, - vector & verGhostInd, - vector & tempCounter, - vector & GMate, - vector& Message, - vector& QLocalVtx, - vector& QGhostVtx, - vector& QMsgType, - vector& QOwner, - MilanLongInt* &candidateMate, - staticQueue& U, - staticQueue& privateU, - staticQueue& privateQLocalVtx, - staticQueue& privateQGhostVtx, - staticQueue& privateQMsgType, - staticQueue& privateQOwner - ) + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + omp_lock_t *MateLock, + map &Ghost2LocalMap, + vector &Counter, 
+ vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) { MilanLongInt insertMe = 0, numGhostEdges = 0, numGhostVertices = 0; @@ -48,53 +47,55 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD) { - - #pragma omp single + +#pragma omp single { - //Initialize the locks - //TODO this can be executed as task in parallel with other unparallelizable tasks - //TODO destroy the locks + // Initialize the locks #pragma omp taskloop num_tasks(NUM_THREAD) - for(i = 0; i < NLVer; i++) - omp_init_lock(&MateLock[i]); - + for (i = 0; i < NLVer; i++) + omp_init_lock(&MateLock[i]); + #ifdef TIME_TRACKER - double Ghost2LocalInitialization = MPI_Wtime(); + double Ghost2LocalInitialization = MPI_Wtime(); #endif - /* - * OMP Ghost2LocalInitialization - * The cycle analyzes all the edges and when finds a ghost edge - * puts it in the Ghost2LocalMap. - * A critical region is needed when inserting data in the map. - * - * Despite the critical region it is still productive to - * parallelize this for because the critical region is exeuted - * only when a ghost edge is found and ghost edges are a minority, - * circa 3.5% during the tests. - */ -#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ : numGhostEdges) depend ( out : numGhostEdges, Counter, Ghost2LocalMap ) - for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice - insertMe = verLocInd[i]; - //cout<<"InsertMe on Process "< EndIndex)) { //Find a ghost - numGhostEdges++; + /* + * OMP Ghost2LocalInitialization + * This loop analyzes all the edges and when finds a ghost edge + * puts it in the Ghost2LocalMap. + * A critical region is needed when inserting data in the map. + * + * Despite the critical region it is still productive to + * parallelize this cycle because the critical region is exeuted + * only when a ghost edge is found and ghost edges are a minority, + * circa 3.5% during the tests. 
+ */ +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges) depend(out \ + : numGhostEdges, Counter, Ghost2LocalMap) + for (i = 0; i < NLEdge; i++) + { // O(m) - Each edge stored twice + insertMe = verLocInd[i]; + if ((insertMe < StartIndex) || (insertMe > EndIndex)) + { // Find a ghost + numGhostEdges++; #pragma omp critical - { - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) { //Has already been added - //cout<<"Process "<first<<" - "<second<second]++; //Increment the counter - } else { //Insert an entry for the ghost: - //cout<<"Process "<second<<" - "<first<<" : "<second]<second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl; fflush(stdout); storedAlready++; - } while ( storedAlready != Ghost2LocalMap.end() ); + } while (storedAlready != Ghost2LocalMap.end()); } #endif - #pragma omp task depend ( out : verGhostPtr, tempCounter, verGhostInd, GMate) depend ( in : numGhostVertices) - { +#pragma omp task depend(out \ + : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ + : numGhostVertices) + { - //Initialize adjacency Lists for Ghost Vertices: - try { - verGhostPtr.reserve(numGhostVertices + 1); //Pointer Vector - tempCounter.reserve(numGhostVertices); //Pointer Vector - verGhostInd.reserve(numGhostEdges); //Index Vector - GMate.reserve(numGhostVertices); //Ghost Mate Vector - } catch (length_error) { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - //Initialize the Vectors: - verGhostPtr.resize(numGhostVertices + 1, 0); //Pointer Vector - tempCounter.resize(numGhostVertices, 0); //Temporary Counter - verGhostInd.resize(numGhostEdges, -1); //Index Vector - GMate.resize(numGhostVertices, -1); //Temporary Counter - verGhostPtr[0] = 0; //The first value + // Initialize adjacency Lists for Ghost Vertices: + try + { + verGhostPtr.reserve(numGhostVertices + 1); // Pointer Vector + tempCounter.reserve(numGhostVertices); // Pointer Vector + verGhostInd.reserve(numGhostEdges); // Index Vector + GMate.reserve(numGhostVertices); // Ghost Mate Vector + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + // Initialize the Vectors: + verGhostPtr.resize(numGhostVertices + 1, 0); // Pointer Vector + tempCounter.resize(numGhostVertices, 0); // Temporary Counter + verGhostInd.resize(numGhostEdges, -1); // Index Vector + GMate.resize(numGhostVertices, -1); // Temporary Counter + verGhostPtr[0] = 0; // The first value #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) - cout< 0) + cout << verGhostPtr[numGhostVertices] << "\n"; + fflush(stdout); #endif #ifdef TIME_TRACKER - double verGhostIndInitialization = MPI_Wtime(); + double verGhostIndInitialization = MPI_Wtime(); #endif - /* - * OMP verGhostIndInitialization - * - * In this cycle the verGhostInd is initialized - * with the datas related to ghost edges. - * The check to see if a node is a ghost node is - * executed in paralle and when a ghost node - * is found a critical region is started. - * - * Despite the critical region it's still useful to - * parallelize the for cause the ghost nodes - * are a minority hence the critical region is executed - * few times, circa 3.5% of the times in the tests. 
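The loop this comment describes relies on a slot-reservation idiom: only the read-and-increment of tempCounter has to be serialized, because once a thread owns a position in verGhostInd no other thread can touch it, so the store itself stays outside the critical section (insertMe is private to each thread). A self-contained sketch of the same idiom with simplified names (cursor plays the role of tempCounter, out the role of verGhostInd):

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    const int nGhost = 4, perGhost = 100;
    std::vector<int> cursor(nGhost, 0);             // per-ghost fill cursor
    std::vector<int> out(nGhost * perGhost, -1);    // adjacency slots to fill

#pragma omp parallel for
    for (int item = 0; item < nGhost * perGhost; item++) {
        int g = item % nGhost;                      // which ghost this item belongs to
        int slot;
#pragma omp critical
        slot = g * perGhost + cursor[g]++;          // reserve a unique position
        out[slot] = item;                           // fill it without further locking
    }
    std::printf("filled %zu slots\n", out.size());
    return 0;
}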
- */ -#pragma omp taskloop num_tasks(NUM_THREAD) depend ( in : insertMe, Ghost2LocalMap, tempCounter) depend ( out : verGhostInd) - for (v = 0; v < NLVer; v++) { - adj1 = verLocPtr[v]; //Vertex Pointer - adj2 = verLocPtr[v + 1]; - for (k = adj1; k < adj2; k++) { - w = verLocInd[k]; //Get the adjacent vertex - if ((w < StartIndex) || (w > EndIndex)) { //Find a ghost + /* + * OMP verGhostIndInitialization + * + * In this cycle the verGhostInd is initialized + * with the datas related to ghost edges. + * The check to see if a node is a ghost node is + * executed in paralle and when a ghost node + * is found a critical region is started. + * + * Despite the critical region it's still useful to + * parallelize the for cause the ghost nodes + * are a minority hence the critical region is executed + * few times, circa 3.5% of the times in the tests. + */ +#pragma omp taskloop num_tasks(NUM_THREAD) depend(in \ + : insertMe, Ghost2LocalMap, tempCounter) depend(out \ + : verGhostInd) + for (v = 0; v < NLVer; v++) + { + adj1 = verLocPtr[v]; // Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) + { + w = verLocInd[k]; // Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) + { // Find a ghost #pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert - tempCounter[Ghost2LocalMap[w]]++; //Increment the counter - } - verGhostInd[insertMe] = v + StartIndex; //Add the adjacency - } //End of if((w < StartIndex) || (w > EndIndex)) - } //End of for(k) - } //End of for (v) + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert + tempCounter[Ghost2LocalMap[w]]++; // Increment the counter + } + verGhostInd[insertMe] = v + StartIndex; // Add the adjacency + } // End of if((w < StartIndex) || (w > EndIndex)) + } // End of for(k) + } // End of for (v) - } // End of parallel region + } // End of parallel region #ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Date: Sun, 26 Jun 2022 10:02:11 -0500 Subject: [PATCH 36/96] initialize fix --- amgprec/impl/aggregator/initialize.cpp | 240 +++++++++++++------------ exec.sh | 1 + 2 files changed, 125 insertions(+), 116 deletions(-) diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 21210c34..c5ae3f26 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -8,7 +8,7 @@ #include "dataStrStaticQueue.h" #include "omp.h" -#define NUM_THREAD 12 +#define NUM_THREAD 4 inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, @@ -50,7 +50,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp single { - // Initialize the locks #pragma omp taskloop num_tasks(NUM_THREAD) for (i = 0; i < NLVer; i++) @@ -71,32 +70,38 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * only when a ghost edge is found and ghost edges are a minority, * circa 3.5% during the tests. 
*/ -#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ - : numGhostEdges) depend(out \ - : numGhostEdges, Counter, Ghost2LocalMap) - for (i = 0; i < NLEdge; i++) - { // O(m) - Each edge stored twice - insertMe = verLocInd[i]; - if ((insertMe < StartIndex) || (insertMe > EndIndex)) - { // Find a ghost - numGhostEdges++; + +#pragma omp task depend(out \ + : numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, numGhostVertices) + { + +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges) + for (i = 0; i < NLEdge; i++) + { // O(m) - Each edge stored twice + insertMe = verLocInd[i]; + if ((insertMe < StartIndex) || (insertMe > EndIndex)) + { // Find a ghost + numGhostEdges++; #pragma omp critical - { - storedAlready = Ghost2LocalMap.find(insertMe); - if (storedAlready != Ghost2LocalMap.end()) - { // Has already been added - Counter[storedAlready->second]++; // Increment the counter + { + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) + { // Has already been added + Counter[storedAlready->second]++; // Increment the counter + } + else + { // Insert an entry for the ghost: + Ghost2LocalMap[insertMe] = numGhostVertices; // Add a map entry + Counter.push_back(1); // Initialize the counter + numGhostVertices++; // Increment the number of ghost vertices + } // End of else() } - else - { // Insert an entry for the ghost: - Ghost2LocalMap[insertMe] = numGhostVertices; // Add a map entry - Counter.push_back(1); // Initialize the counter - numGhostVertices++; // Increment the number of ghost vertices - } // End of else() - } - } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) - } // End of for(ghost vertices) + } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) + } // End of for(ghost vertices) + } // end of task depend + // numGhostEdges = atomicNumGhostEdges; #ifdef TIME_TRACKER Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); @@ -121,7 +126,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ - : numGhostVertices) + : numGhostVertices, numGhostEdges) { // Initialize adjacency Lists for Ghost Vertices: @@ -151,7 +156,7 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } // End of task -#pragma omp task depent(out \ +#pragma omp task depend(out \ : verGhostPtr) depend(in \ : Counter, numGhostVertices) { @@ -198,123 +203,126 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * are a minority hence the critical region is executed * few times, circa 3.5% of the times in the tests. 
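The restructuring visible above, where the depend clauses moved from the taskloop onto an enclosing task and the taskloop is nested inside it, follows from the OpenMP rules: depend is a clause of the task (and target) constructs, not of taskloop, so ordering between initialization stages has to be expressed on wrapper tasks. A reduced sketch of that shape, with made-up stage contents:

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    const int n = 1000;
    std::vector<int> counts(n, 0);
    long total = 0;

#pragma omp parallel num_threads(4)
#pragma omp single
    {
#pragma omp task depend(out : counts)                      // stage 1: produce counts
        {
#pragma omp taskloop num_tasks(4)
            for (int i = 0; i < n; i++)
                counts[i] = i % 3;
        }

#pragma omp task depend(in : counts) depend(out : total)   // stage 2: consume them
        {
#pragma omp taskloop num_tasks(4) reduction(+ : total)     // taskloop reduction needs OpenMP 5.0
            for (int i = 0; i < n; i++)
                total += counts[i];
        }
    }                                                       // implicit barrier waits for both tasks
    std::printf("total = %ld\n", total);                    // prints 999
    return 0;
}

Dependences only order sibling tasks generated from the same task region, which is why the wrappers are all created from within the single region.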
*/ -#pragma omp taskloop num_tasks(NUM_THREAD) depend(in \ - : insertMe, Ghost2LocalMap, tempCounter) depend(out \ - : verGhostInd) - for (v = 0; v < NLVer; v++) +#pragma omp task depend(in \ + : insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \ + : verGhostInd) { - adj1 = verLocPtr[v]; // Vertex Pointer - adj2 = verLocPtr[v + 1]; - for (k = adj1; k < adj2; k++) +#pragma omp taskloop num_tasks(NUM_THREAD) + for (v = 0; v < NLVer; v++) { - w = verLocInd[k]; // Get the adjacent vertex - if ((w < StartIndex) || (w > EndIndex)) - { // Find a ghost + adj1 = verLocPtr[v]; // Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) + { + w = verLocInd[k]; // Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) + { // Find a ghost #pragma omp critical - { - insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert - tempCounter[Ghost2LocalMap[w]]++; // Increment the counter - } - verGhostInd[insertMe] = v + StartIndex; // Add the adjacency - } // End of if((w < StartIndex) || (w > EndIndex)) - } // End of for(k) - } // End of for (v) - - } // End of parallel region + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert + tempCounter[Ghost2LocalMap[w]]++; // Increment the counter + } + verGhostInd[insertMe] = v + StartIndex; // Add the adjacency + } // End of if((w < StartIndex) || (w > EndIndex)) + } // End of for(k) + } // End of for (v) + } // end of tasklopp #ifdef TIME_TRACKER - verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; - fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Ghost Vertex Index: "; - for (v = 0; v < numGhostEdges; v++) - cout << verGhostInd[v] << "\t"; - cout << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Ghost Vertex Index: "; + for (v = 0; v < numGhostEdges; v++) + cout << verGhostInd[v] << "\t"; + cout << endl; + fflush(stdout); #endif #pragma omp task depend(in \ : numGhostEdges) depend(out \ : QLocalVtx, QGhostVtx, QMsgType, QOwner) - { - try { - QLocalVtx.reserve(numGhostEdges); // Local Vertex - QGhostVtx.reserve(numGhostEdges); // Ghost Vertex - QMsgType.reserve(numGhostEdges); // Message Type (Request/Failure) - QOwner.reserve(numGhostEdges); // Owner of the ghost: COmpute once and use later - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - } + try + { + QLocalVtx.reserve(numGhostEdges); // Local Vertex + QGhostVtx.reserve(numGhostEdges); // Ghost Vertex + QMsgType.reserve(numGhostEdges); // Message Type (Request/Failure) + QOwner.reserve(numGhostEdges); // Owner of the ghost: COmpute once and use later + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + } // end of task #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Allocating CandidateMate.. "; - fflush(stdout); + cout << "\n(" << myRank << ")Allocating CandidateMate.. 
"; + fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; - fflush(stdout); - fflush(stdout); + cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; + fflush(stdout); + fflush(stdout); #endif + #ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; - fflush(stdout); + if (myRank == 0) + cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; + fflush(stdout); #endif #pragma omp task depend(in \ : numGhostEdges, numGhostVertices) depend(out \ : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) - { - - - //The values calculated in this function are sent back to the calling function - *numGhostEdgesPtr = numGhostEdges; - *numGhostVerticesPtr = numGhostVertices; - - // Allocate Data Structures: - /* - * candidateMate was a vector and has been replaced with an array - * there is no point in using the vector (or maybe there is (???)) - * so I replaced it with an array wich is slightly faster - */ - candidateMate = new MilanLongInt[NLVer + numGhostVertices]; - - *S = numGhostVertices; // Initialize S with number of Ghost Vertices + { - /* - * Create the Queue Data Structure for the Dominating Set - * - * I had to declare the staticuQueue U before the parallel region - * to have it in the correct scope. Since we can't change the dimension - * of a staticQueue I had to destroy the previous object and instantiate - * a new one of the correct size. - */ - new (&U) staticQueue(NLVer + numGhostVertices); - - // TODO how can I decide a more meaningfull size? - MilanLongInt size = numGhostVertices; - - // Initialize the privte data structure - new (&privateU) staticQueue(NLVer + numGhostVertices); // TODO how can I put a meaningfull size? - new (&privateQLocalVtx) staticQueue(size); - new (&privateQGhostVtx) staticQueue(size); - new (&privateQMsgType) staticQueue(size); - new (&privateQOwner) staticQueue(size); - } - } // End of single + // The values calculated in this function are sent back to the calling function + *numGhostEdgesPtr = numGhostEdges; + *numGhostVerticesPtr = numGhostVertices; + + // Allocate Data Structures: + /* + * candidateMate was a vector and has been replaced with an array + * there is no point in using the vector (or maybe there is (???)) + * so I replaced it with an array wich is slightly faster + */ + candidateMate = new MilanLongInt[NLVer + numGhostVertices]; + + *S = numGhostVertices; // Initialize S with number of Ghost Vertices + + /* + * Create the Queue Data Structure for the Dominating Set + * + * I had to declare the staticuQueue U before the parallel region + * to have it in the correct scope. Since we can't change the dimension + * of a staticQueue I had to destroy the previous object and instantiate + * a new one of the correct size. + */ + new (&U) staticQueue(NLVer + numGhostVertices); + + // TODO how can I decide a more meaningfull size? + MilanLongInt size = numGhostVertices; + + // Initialize the privte data structure + new (&privateU) staticQueue(NLVer + numGhostVertices); // TODO how can I put a meaningfull size? 
+ new (&privateQLocalVtx) staticQueue(size); + new (&privateQGhostVtx) staticQueue(size); + new (&privateQMsgType) staticQueue(size); + new (&privateQOwner) staticQueue(size); + } // end of task + + } // End of single region + } // End of parallel region } diff --git a/exec.sh b/exec.sh index 3bb7bd90..50edf4ad 100755 --- a/exec.sh +++ b/exec.sh @@ -1,3 +1,4 @@ +rm amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o make all cd samples/advanced/pdegen make amg_d_pde3d From 7cfe198d0f383c541c83d61b7e99e016699e086d Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 26 Jun 2022 10:45:06 -0500 Subject: [PATCH 37/96] Format --- amgprec/impl/aggregator/MatchBoxPC.h | 315 ++-- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1562 +++++++++-------- 2 files changed, 1024 insertions(+), 853 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 54830919..530933e5 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -52,7 +52,7 @@ #ifndef _matchboxpC_H_ #define _matchboxpC_H_ -//Turn on a lot of debugging information with this switch: +// Turn on a lot of debugging information with this switch: //#define PRINT_DEBUG_INFO_ #include #include @@ -66,193 +66,190 @@ using namespace std; #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif #if !defined(SERIAL_MPI) - -#define MilanMpiLongInt MPI_LONG_LONG + +#define MilanMpiLongInt MPI_LONG_LONG #ifndef _primitiveDataType_Definition_ #define _primitiveDataType_Definition_ - //Regular integer: - #ifndef INTEGER_H - #define INTEGER_H - typedef int32_t MilanInt; - #endif +// Regular integer: +#ifndef INTEGER_H +#define INTEGER_H + typedef int32_t MilanInt; +#endif - //Regular long integer: - #ifndef LONG_INT_H - #define LONG_INT_H - #ifdef BIT64 - typedef int64_t MilanLongInt; - typedef MPI_LONG MilanMpiLongInt; - #else - typedef int32_t MilanLongInt; - typedef MPI_INT MilanMpiLongInt; - #endif - #endif +// Regular long integer: +#ifndef LONG_INT_H +#define LONG_INT_H +#ifdef BIT64 + typedef int64_t MilanLongInt; + typedef MPI_LONG MilanMpiLongInt; +#else + typedef int32_t MilanLongInt; + typedef MPI_INT MilanMpiLongInt; +#endif +#endif - //Regular boolean - #ifndef BOOL_H - #define BOOL_H - typedef bool MilanBool; - #endif +// Regular boolean +#ifndef BOOL_H +#define BOOL_H + typedef bool MilanBool; +#endif - //Regular double and absolute value computation: - #ifndef REAL_H - #define REAL_H - typedef double MilanReal; - typedef MPI_DOUBLE MilanMpiReal; - inline MilanReal MilanAbs(MilanReal value) - { - return fabs(value); - } - #endif +// Regular double and absolute value computation: +#ifndef REAL_H +#define REAL_H + typedef double MilanReal; + typedef MPI_DOUBLE MilanMpiReal; + inline MilanReal MilanAbs(MilanReal value) + { + return fabs(value); + } +#endif - //Regular float and absolute value computation: - #ifndef FLOAT_H - #define FLOAT_H - typedef float MilanFloat; - typedef MPI_FLOAT MilanMpiFloat; - inline MilanFloat MilanAbsFloat(MilanFloat value) - { - return fabs(value); - } - #endif +// Regular float and absolute value computation: +#ifndef FLOAT_H +#define FLOAT_H + typedef float MilanFloat; + typedef MPI_FLOAT MilanMpiFloat; + inline MilanFloat MilanAbsFloat(MilanFloat value) + { + return fabs(value); + } +#endif - //// Define the limits: - #ifndef LIMITS_H - #define LIMITS_H - //Integer Maximum and Minimum: - // #define MilanIntMax INT_MAX - // #define MilanIntMin INT_MIN - #define MilanIntMax INT32_MAX - 
#define MilanIntMin INT32_MIN +//// Define the limits: +#ifndef LIMITS_H +#define LIMITS_H + // Integer Maximum and Minimum: + // #define MilanIntMax INT_MAX + // #define MilanIntMin INT_MIN +#define MilanIntMax INT32_MAX +#define MilanIntMin INT32_MIN - #ifdef BIT64 - #define MilanLongIntMax INT64_MAX - #define MilanLongIntMin -INT64_MAX - #else - #define MilanLongIntMax INT32_MAX - #define MilanLongIntMin -INT32_MAX - #endif +#ifdef BIT64 +#define MilanLongIntMax INT64_MAX +#define MilanLongIntMin -INT64_MAX +#else +#define MilanLongIntMax INT32_MAX +#define MilanLongIntMin -INT32_MAX +#endif - #endif +#endif // +INFINITY const double PLUS_INFINITY = numeric_limits::infinity(); const double MINUS_INFINITY = -PLUS_INFINITY; - //#define MilanRealMax LDBL_MAX - #define MilanRealMax PLUS_INFINITY - #define MilanRealMin MINUS_INFINITY +//#define MilanRealMax LDBL_MAX +#define MilanRealMax PLUS_INFINITY +#define MilanRealMin MINUS_INFINITY #endif -//Function of find the owner of a ghost vertex using binary search: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + // Function of find the owner of a ghost vertex using binary search: + inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs); -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt* verLocInd, - MilanReal* edgeLocWeight); + inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight); -inline bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap); + inline bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); -inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal* edgeLocWeight, - MilanLongInt k, - MilanLongInt* verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap); + inline MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); -inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt* numGhostEdgesPtr, - MilanLongInt* numGhostVerticesPtr, - MilanLongInt* S, - MilanLongInt* verLocInd, - MilanLongInt* verLocPtr, - omp_lock_t* MateLock, - map &Ghost2LocalMap, - vector & Counter, - vector & verGhostPtr, - vector & verGhostInd, - vector & tempCounter, - vector & GMate, - vector& Message, - vector& QLocalVtx, - vector& QGhostVtx, - vector& QMsgType, - vector& QOwner, - MilanLongInt* &candidateMate, - staticQueue& U, - staticQueue& privateU, - staticQueue& privateQLocalVtx, - staticQueue& privateQGhostVtx, - staticQueue& privateQMsgType, - staticQueue& privateQOwner - ); + inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + omp_lock_t *MateLock, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector 
&verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); -void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP - ( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC -( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); - void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC -( -MilanLongInt NLVer, MilanLongInt NLEdge, -MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, -MilanLongInt* verDistance, -MilanLongInt* Mate, -MilanInt myRank, MilanInt numProcs, MPI_Comm comm, -MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, -MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, -MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); -void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MilanInt icomm, - MilanLongInt* 
msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MilanInt icomm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); -void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MilanInt icomm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MilanInt icomm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); #endif #ifdef __cplusplus diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d6c58852..d470b1ab 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -81,26 +81,32 @@ #ifdef SERIAL_MPI #else -//MPI type map -template MPI_Datatype TypeMap(); -template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template<> inline MPI_Datatype TypeMap() { return MPI_INT; } -template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } +// MPI type map +template +MPI_Datatype TypeMap(); +template <> +inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template <> +inline MPI_Datatype TypeMap() { return MPI_INT; } +template <> +inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template <> +inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION -//WARNING: The vertex block on a given rank is contiguous +// WARNING: The vertex block on a given rank is contiguous void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, - MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, - MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, + MilanReal *msgPercent, 
+ MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card) +{ /* * verDistance: it's a vector long as the number of processors. @@ -118,152 +124,159 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #if !defined(SERIAL_MPI) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Message; // [ u, v, message_type ] - Message.resize(3,-1); - const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; + Message.resize(3, -1); + const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; const MilanLongInt SIZEINFO = 4; MilanLongInt message_type = 0; - //Data structures for Message Bundling: - //Although up to two messages can be sent along any cross edge, - //only one message will be sent in the initialization phase - - //one of: REQUEST/FAILURE/SUCCESS + // Data structures for Message Bundling: + // Although up to two messages can be sent along any cross edge, + // only one message will be sent in the initialization phase - + // one of: REQUEST/FAILURE/SUCCESS vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! - MilanLongInt* PCounter = new MilanLongInt [numProcs]; + MilanLongInt *PCounter = new MilanLongInt[numProcs]; for (int i = 0; i < numProcs; i++) PCounter[i] = 0; - MilanLongInt NumMessagesBundled = 0; MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! - MilanLongInt* candidateMate = nullptr; + MilanLongInt *candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Ghost2LocalMap; //Map each ghost vertex to a local vertex - vector Counter; //Store the edge count for each ghost vertex - MilanLongInt numGhostVertices = 0, numGhostEdges = 0; //Number of Ghost vertices + // Build the Ghost Vertex Set: Vg + map Ghost2LocalMap; // Map each ghost vertex to a local vertex + vector Counter; // Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of Ghost vertices #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< verGhostPtr, verGhostInd, tempCounter; - //Mate array for ghost vertices: - vector GMate; //Proportional to the number of ghost vertices + // Define Adjacency Lists for Ghost Vertices: + // cout<<"Building Ghost data structures ... 
\n\n"; + vector verGhostPtr, verGhostInd, tempCounter; + // Mate array for ghost vertices: + vector GMate; // Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; MilanLongInt myIndex = 0; - vector PCumulative, PMessageBundle, PSizeInfoMessages; - vector SRequest; //Requests that are used for each send message - vector SStatus; //Status of sent messages, used in MPI_Wait - MilanLongInt MessageIndex = 0; //Pointer for current message + vector PCumulative, PMessageBundle, PSizeInfoMessages; + vector SRequest; // Requests that are used for each send message + vector SStatus; // Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex = 0; // Pointer for current message MilanInt OneMessageSize = 0; MilanLongInt numMessagesToSend; MilanInt BufferSize; MilanLongInt *Buffer; bool isEmpty; - //Declare the locks - // TODO destroy the locks + // Declare the locks + // TODO destroy the locks omp_lock_t MateLock[NLVer]; - initialize(NLVer, NLEdge, StartIndex, - EndIndex, &numGhostEdges, - &numGhostVertices, &S, - verLocInd, verLocPtr, - MateLock, - Ghost2LocalMap, Counter, - verGhostPtr, verGhostInd, - tempCounter, GMate, - Message, QLocalVtx, - QGhostVtx, QMsgType, QOwner, - candidateMate, U, - privateU, - privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner - ); - + initialize(NLVer, NLEdge, StartIndex, + EndIndex, &numGhostEdges, + &numGhostVertices, &S, + verLocInd, verLocPtr, + MateLock, + Ghost2LocalMap, Counter, + verGhostPtr, verGhostInd, + tempCounter, GMate, + Message, QLocalVtx, + QGhostVtx, QMsgType, QOwner, + candidateMate, U, + privateU, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + finishTime = MPI_Wtime(); - *ph0_time = finishTime - startTime; //Time taken for Phase-0: Initialization + *ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization - startTime = MPI_Wtime(); - ///////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// INITIALIZATION ///////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// - //Compute the Initial Matching Set: + // Compute the Initial Matching Set: #pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { /* - * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from - * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize - * the two. - * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. - */ + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
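The loop that follows is race free because each iteration writes only its own candidateMate[v] and reads the CSR arrays without modifying them, so the worksharing for needs no synchronization at all. A reduced, self-contained sketch of the same step (the real firstComputeCandidateMate also skips ineligible or already matched neighbours, which is omitted here):

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
    // Tiny CSR graph: 3 vertices, arbitrary weights.
    std::vector<long>   ptr = {0, 2, 4, 6};
    std::vector<long>   ind = {1, 2, 0, 2, 0, 1};
    std::vector<double> wt  = {0.5, 2.0, 0.5, 1.0, 2.0, 1.0};
    std::vector<long>   candidateMate(3, -1);

#pragma omp parallel for schedule(static)
    for (long v = 0; v < 3; v++) {
        double best = -1.0;
        for (long k = ptr[v]; k < ptr[v + 1]; k++)
            if (wt[k] > best) { best = wt[k]; candidateMate[v] = ind[k]; }  // heaviest neighbour
    }
    for (long v = 0; v < 3; v++)
        std::printf("candidateMate[%ld] = %ld\n", v, candidateMate[v]);
    return 0;
}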
+ */ #pragma omp for schedule(static) - for ( v=0; v < NLVer; v++ ) { + for (v = 0; v < NLVer; v++) + { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { + // If found a dominating edge: + if (w >= 0) + { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { w = computeCandidateMate(verLocPtr[v], verLocPtr[v + 1], edgeLocWeight, 0, @@ -310,13 +328,15 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( candidateMate[v] = w; } - if (w >= 0) { + if (w >= 0) + { myCard++; - if ((w < StartIndex) || (w > EndIndex)) { //w is a ghost vertex + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { + if (Counter[Ghost2LocalMap[w]] > 0) + { - Counter[Ghost2LocalMap[w]] -= 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + Counter[Ghost2LocalMap[w]] -= 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } //End of if CandidateMate[w] = v - + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } // End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { // w is a local vertex + } // End of if a Ghost Vertex + else + { // w is a local vertex - if (candidateMate[w - StartIndex] == (v + StartIndex)) { + if (candidateMate[w - StartIndex] == (v + StartIndex)) + { privateU.push_back(v + StartIndex); privateU.push_back(w); - Mate[v] = w; //v is local - //FIXME this instruction could create errors - Mate[w - StartIndex] = v + StartIndex; //w is local - + Mate[v] = w; // v is local + // FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; // w is local #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) + } // End of if(w >=0) - //This piece of code is executed a really small amount of times, I will not allocate a - //huge amount of memory to the private data structures. + // This piece of code is executed a really small amount of times, I will not allocate a + // huge amount of memory to the private data structures. 
adj11 = verLocPtr[v]; adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) { + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< Us; + // TODO what would be the optimal UCHUNK + vector Us; Us.reserve(UCHUNK); - while( true ) { + while (true) + { Us.clear(); #pragma omp critical(U) { - //If U is emptu and there are no new node to add to U + // If U is emptu and there are no new node to add to U if (U.empty() && privateU.empty()) isEmpty = true; - else { + else + { if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U - while (!privateU.empty()) { + while (!privateU.empty()) + { U.push_back(privateU.pop_front()); myCard += privateMyCard; } - for (int i = 0; i < UCHUNK; i++) { // Pop the new nodes - if (U.empty()) break; + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; Us.push_back(U.pop_front()); } } } // End of critical U - if (isEmpty) break; + if (isEmpty) + break; for (MilanLongInt u : Us) { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex)) { //Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices ++; + localVertices++; #endif - //Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; //Pointer + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) { + for (k = adj1; k < adj2; k++) + { v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) { //If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: #pragma omp critical(innerProcessMatched) { #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) { + // If found a dominating edge: + if (w >= 0) + { - //TODO is it possible to lock without a critical region? - //TODO there must be a more elegant and efficient way to do this + // TODO is it possible to lock without a critical region? 
+ // TODO there must be a more elegant and efficient way to do this /* while(true) { if (omp_test_lock(&MateLock[v - StartIndex])) { @@ -562,11 +598,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } */ - - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if (Counter[Ghost2LocalMap[w]] == 0) { - S--; //Decrement S + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if (candidateMate[w - StartIndex] == v) { - Mate[v - StartIndex] = w; //v is a local vertex - Mate[w - StartIndex] = v; //w is a local vertex - //Q.push_back(u); + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + // Q.push_back(u); privateU.push_back(v); privateU.push_back(w); privateMyCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { + } // End of if(w >=0) + else + { adj11 = verLocPtr[v - StartIndex]; adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) { + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) { //A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else { //Neighbor is a ghost vertex + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex #pragma omp critical(innerProcessMatched) { - //while(!omp_test_lock(&MateLock[u - StartIndex])); + // while(!omp_test_lock(&MateLock[u - StartIndex])); if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) { //u is local - //Build the Message Packet: - //Message[0] = u; //LOCAL - //Message[1] = v; //GHOST - //Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) + if (v != Mate[u - StartIndex]) + { // u is local + // Build the Message Packet: + // Message[0] = u; //LOCAL + // Message[1] = v; //GHOST + // Message[2] = SUCCESS; //TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - //Avoid to ask for the critical section if there is nothing to add - if (privateU.size() < UCHUNK && !U.empty()) continue; + // Avoid to ask for the critical section if there is nothing to add + if (privateU.size() < UCHUNK && !U.empty()) + continue; #pragma omp critical(U) { - while (!privateU.empty()) { + while (!privateU.empty()) + { U.push_back(privateU.pop_front()); } myCard += privateMyCard; - } //End of critical U - + } // End of critical U } - } //End of while ( /*!Q.empty()*/ !U.empty() ) + } // End of while ( 
/*!Q.empty()*/ !U.empty() ) - #pragma omp critical(privateMsg) +#pragma omp critical(privateMsg) { - while (!privateQLocalVtx.empty()) { + while (!privateQLocalVtx.empty()) + { QLocalVtx.push_back(privateQLocalVtx.pop_front()); QGhostVtx.push_back(privateQGhostVtx.pop_front()); QMsgType.push_back(privateQMsgType.pop_front()); QOwner.push_back(privateQOwner.pop_front()); - } - } - #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); + localVertices, + omp_get_thread_num(), + myRank); #endif - ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< 0) { //Send only if it is a nonempty packet + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, &SRequest[MessageIndex]); msgActual++; MessageIndex++; - //Now Send the message with the data packet: + // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), i, BundleTag, comm, &SRequest[MessageIndex]); MessageIndex++; - } //End of if size > 0 + } // End of if size > 0 } - //Free up temporary memory: + // Free up temporary memory: PCumulative.clear(); QLocalVtx.clear(); QGhostVtx.clear(); QMsgType.clear(); QOwner.clear(); - #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), comm, &OneMessageSize); //Size of one message packet - //How many messages to send? - //Potentially three kinds of messages will be sent/received: - //Request, Success, Failure. - //But only two will be sent from a given processor. - //Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = numGhostEdges*2 - NumMessagesBundled; - BufferSize = (OneMessageSize+MPI_BSEND_OVERHEAD)*numMessagesToSend; - - Buffer=0; -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; + BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; + + Buffer = 0; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << BufferSize; + cout << "\n(" << myRank << ")Attaching Buffer on.. 
"; + fflush(stdout); #endif - if ( BufferSize > 0 ) { - Buffer = (MilanLongInt *) malloc(BufferSize); //Allocate memory - if ( Buffer == 0 ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout<<"Not enough memory to allocate for send buffer on process "< 0) + { + Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; exit(1); } - MPI_Buffer_attach(Buffer, BufferSize); //Attach the Buffer + MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer } - } //End of master + } // End of master } // end of parallel region ///////////////////////// END OF SEND BUNDLED MESSAGES ////////////////////////////////// finishTime = MPI_Wtime(); - *ph1_time = finishTime-startTime; //Time taken for Phase-1 - *ph1_card = myCard; //Cardinality at the end of Phase-1 + *ph1_time = finishTime - startTime; // Time taken for Phase-1 + *ph1_card = myCard; // Cardinality at the end of Phase-1 startTime = MPI_Wtime(); ///////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////// MAIN LOOP ////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// - //Main While Loop: + // Main While Loop: #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< ReceiveBuffer; - MilanLongInt bundleSize=0, bundleCounter=0; - try { - ReceiveBuffer.reserve(numGhostEdges*2*3); //Three integers per cross edge - } catch ( length_error ) { - cout<<"Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout<<"Not enough memory to allocate the internal variables \n"; + MilanLongInt bundleSize = 0, bundleCounter = 0; + try + { + ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; exit(1); } - while ( true ) { + while (true) + { #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("<= StartIndex) && (u <= EndIndex) ) { //Process Only If a Local Vertex - //Get the Adjacency list for u - adj1 = verLocPtr[u-StartIndex]; //Pointer - adj2 = verLocPtr[u-StartIndex+1]; - for( k = adj1; k < adj2; k++ ) { + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only If a Local Vertex + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { v = verLocInd[k]; - if ( (v >= StartIndex) && (v <= EndIndex) ) { //v is a Local Vertex: - if ( Mate[v-StartIndex] >= 0 ) // v is already matched + if ((v >= StartIndex) && (v <= EndIndex)) + { // v is a Local Vertex: + if (Mate[v - StartIndex] >= 0) // v is already matched continue; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<EndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k1 = adj11; k1 < adj12; k1++) + { + if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) + { // Is it a ghost vertex? 
+ if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched continue; } - else { //A local vertex - if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + else + { // A local vertex + if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched continue; } - if( (edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + if ((edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) + { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } - } //End of for loop - candidateMate[v-StartIndex] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) + { + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { - Mate[v-StartIndex] = w; //v is local - GMate[Ghost2LocalMap[w]] = v; //w is ghost - //Q.push_back(u); + msgInd++; + msgActual++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { - S--; //Decrement S + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if ( candidateMate[w-StartIndex] == v ) { - Mate[v-StartIndex] = w; //v is local - Mate[w-StartIndex] = v; //w is local - //Q.push_back(u); + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { //no dominating edge found: w == -1 - adj11 = verLocPtr[v-StartIndex]; - adj12 = verLocPtr[v-StartIndex+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { + } // End of if(CandidateMate(w) = v + } // End of Else + } // 
End of if(w >=0) + else + { // no dominating edge found: w == -1 + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - } //End of if(GHOST) - } //End of for loop - } // End of Else: w == -1 - //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } //End of If (candidateMate[v-StartIndex] == u) - } //End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else { //Neighbor v is a ghost vertex - if ( candidateMate[NLVer+Ghost2LocalMap[v]] == u ) - candidateMate[NLVer+Ghost2LocalMap[v]] = -1; - if ( v != Mate[u-StartIndex] ) { //u is a local vertex - //Build the Message Packet: - Message[0] = u; //LOCAL - Message[1] = v; //GHOST - Message[2] = SUCCESS; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor v is a ghost vertex + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + { // u is a local vertex + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs); fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); msgInd++; msgActual++; #ifdef DEBUG_GHOST_ - if ((uEndIndex)) { - cout<<"\n("<= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - } //End of while ( /*!Q.empty()*/ !U.empty() ) + if ((u < StartIndex) || (u > EndIndex)) + { + cout << "\n(" << myRank << ") " << __LINE__ << " From Send: should not happen: u= " << u << " v= " << v << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } +#endif + + } // End of If( v != Mate[u] ) + } // End of Else //A Ghost Vertex + } // End of For Loop adj(u) + } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } // End of while ( /*!Q.empty()*/ !U.empty() ) ///////////////////////// END OF PROCESS MATCHED VERTICES 
///////////////////////// //// BREAK IF NO MESSAGES EXPECTED ///////// #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS ) { + if (error_codeC != MPI_SUCCESS) + { MPI_Error_string(error_codeC, error_message, &message_length); - cout<<"\n*Error in call to MPI_Receive on Slave: "<(), Sender, BundleTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS ) { + if (error_codeC != MPI_SUCCESS) + { MPI_Error_string(error_codeC, error_message, &message_length); - cout<<"\n*Error in call to MPI_Receive on processor "< EndIndex)) + { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } #endif #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<NLVer)) { - cout<<"\n("< NLVer)) + { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } #endif - if ( Mate[v-StartIndex] == -1 ) { //Process only if not already matched (v is local) - candidateMate[NLVer+Ghost2LocalMap[u]] = v; //Set CandidateMate for the ghost - if ( candidateMate[v-StartIndex] == u ) { - GMate[Ghost2LocalMap[u]] = v; //u is ghost - Mate[v-StartIndex] = u; //v is local - //Q.push_back(u); + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) + { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + // Q.push_back(u); U.push_back(v); U.push_back(u); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[u]] == 0 ) { - S--; //Decrement S + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + if (Counter[Ghost2LocalMap[u]] > 0) + { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement + if (Counter[Ghost2LocalMap[u]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) - } //End of if ( candidateMate[v-StartIndex] == u )e - } //End of if ( Mate[v] == -1 ) - } //End of REQUEST - else { //CASE II: SUCCESS - if ( message_type == SUCCESS ) { -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[u]] == 0 ) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else + { // CASE II: SUCCESS + if (message_type == SUCCESS) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); +#endif + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) + // process it again + if (Counter[Ghost2LocalMap[u]] > 0) + { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement + if (Counter[Ghost2LocalMap[u]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: 
PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) #ifdef DEBUG_GHOST_ - if ((v<0)||(vNLVer)) { - cout<<"\n("< NLVer)) + { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } +#endif + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; w = -1; - heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - for( k1 = adj11; k1 < adj12; k1++ ) { - if ( (verLocInd[k1]EndIndex) ) { //Is it a ghost vertex? - if(GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0 )// Already matched + heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k1 = adj11; k1 < adj12; k1++) + { + if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) + { // Is it a ghost vertex? + if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched continue; } - else { //A local vertex - if( Mate[verLocInd[k1]-StartIndex] >= 0 ) // Already matched + else + { // A local vertex + if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched continue; } - if( (edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt)&&(w < verLocInd[k1])) ) { + if ((edgeLocWeight[k1] > heaviestEdgeWt) || + ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) + { heaviestEdgeWt = edgeLocWeight[k1]; w = verLocInd[k1]; } - } //End of for loop - candidateMate[v-StartIndex] = w; - //End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0 ) { - if ( (w < StartIndex) || (w > EndIndex) ) { //w is a ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = REQUEST; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<= 0) + { + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - if ( candidateMate[NLVer+Ghost2LocalMap[w]] == v ) { - Mate[v-StartIndex] = w; //v is local - GMate[Ghost2LocalMap[w]] = v; //w is ghost - //Q.push_back(u); + msgInd++; + msgActual++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[w]] == 0 ) { - S--; //Decrement S + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + 
if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } //End of if CandidateMate[w] = v - } //End of if a Ghost Vertex - else { //w is a local vertex - if ( candidateMate[w-StartIndex] == v ) { - Mate[v-StartIndex] = w; //v is local - Mate[w-StartIndex] = v; //w is local - //Q.push_back(u); + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("<=0) - else { //No dominant edge found - adj11 = verLocPtr[v-StartIndex]; - adj12 = verLocPtr[v-StartIndex+1]; - for( k1 = adj11; k1 < adj12; k1++ ) { + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { w = verLocInd[k1]; - if ( (w < StartIndex) || (w > EndIndex) ) { //A ghost - //Build the Message Packet: - Message[0] = v; //LOCAL - Message[1] = w; //GHOST - Message[2] = FAILURE; //TYPE - //Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - //MPI_Bsend(&Message[0], 3, MilanMpiLongInt, findOwnerOfGhost(w, verDistance, myRank, numProcs), - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + // MPI_Bsend(&Message[0], 3, MilanMpiLongInt, findOwnerOfGhost(w, verDistance, myRank, numProcs), + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; msgActual++; - } //End of if(GHOST) - } //End of for loop - } // End of Else: w == -1 - //End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } //End of if ( candidateMate[v-StartIndex] == u ) - } //End of if ( Mate[v] == -1 ) - } //End of if ( message_type == SUCCESS ) - else { //CASE III: FAILURE -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; //Decrement - if ( Counter[Ghost2LocalMap[u]] == 0 ) { - S--; //Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) + { + Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement + if (Counter[Ghost2LocalMap[u]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; fflush(stdout); #endif } - } //End of if Counter[w] > 0 - //End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) - } //End of else: CASE III - } //End of else: CASE I - } //End of if (!MsgQ.empty()) + } // End of if Counter[w] > 0 + // End: 
PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + } // End of else: CASE III + } // End of else: CASE I + } // End of if (!MsgQ.empty()) ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0 ) { - MPI_Buffer_detach(&Buffer, &BufferSize); //Detach the Buffer - free(Buffer); //Free the memory that was allocated + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated } finishTime = MPI_Wtime(); - *ph2_time = finishTime-startTime; //Time taken for Phase-2 - *ph2_card = myCard ; //Cardinality at the end of Phase-2 + *ph2_time = finishTime - startTime; // Time taken for Phase-2 + *ph2_card = myCard; // Cardinality at the end of Phase-2 #ifdef PRINT_DEBUG_INFO_ - cout<<"\n("< 0) { - *msgPercent = ((double)NumMessagesBundled/(double)(msgInd))*100.0; - } else { + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { *msgPercent = 0; } #ifdef DEBUG_HANG_ - if (myRank == 0) cout<<"\n("< Date: Thu, 30 Jun 2022 16:48:18 -0500 Subject: [PATCH 38/96] refactoring parallelComputeCandidateMateB --- amgprec/impl/aggregator/MatchBoxPC.h | 8 ++++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 45 +++++++++---------- .../parallelComputeCandidateMateB.cpp | 36 +++++++++++++++ 3 files changed, 65 insertions(+), 24 deletions(-) create mode 100644 amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 530933e5..88e205ba 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -64,6 +64,7 @@ #include "dataStrStaticQueue.h" using namespace std; +#define NUM_THREAD 4 #ifdef __cplusplus extern "C" @@ -203,6 +204,13 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); + inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d470b1ab..980824aa 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -5,6 +5,7 @@ #include "findOwnerOfGhost.cpp" #include "computeCandidateMate.cpp" #include "initialize.cpp" +#include "parallelComputeCandidateMateB.cpp" // *********************************************************************** // @@ -258,26 +259,22 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ///////////////////////////////////////////////////////////////////////////////////////// // Compute the Initial Matching Set: + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. 
+ */ + + PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer, + verLocPtr, + verLocInd, + myRank, + edgeLocWeight, + candidateMate); + #pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { - /* - * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from - * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize - * the two. - * In particular PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. - */ - -#pragma omp for schedule(static) - for (v = 0; v < NLVer; v++) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; - fflush(stdout); -#endif - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - } /* * PARALLEL_PROCESS_EXPOSED_VERTEX_B @@ -476,7 +473,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } } -#pragma omp master +#pragma omp single { tempCounter.clear(); // Do not need this any more } @@ -715,11 +712,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( candidateMate[NLVer + Ghost2LocalMap[v]] = -1; if (v != Mate[u - StartIndex]) { // u is local - // Build the Message Packet: - // Message[0] = u; //LOCAL - // Message[1] = v; //GHOST - // Message[2] = SUCCESS; //TYPE - // Send a Request (Asynchronous) + // Build the Message Packet: + // Message[0] = u; //LOCAL + // Message[1] = v; //GHOST + // Message[2] = SUCCESS; //TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Sending a success message: "; diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp new file mode 100644 index 00000000..ced93456 --- /dev/null +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -0,0 +1,36 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate) +{ + + MilanLongInt v = -1; + +#pragma omp parallel private(v) default(shared) num_threads(4) + { + +#pragma omp for schedule(static) + for (v = 0; v < NLVer; v++) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; + fflush(stdout); +#endif + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + } + } +} From b66de7f25cf15ca1b69d3a7150a5d15c797c34e4 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 6 Jul 2022 12:58:00 -0500 Subject: [PATCH 39/96] Refactoring PARALLEL_PROCESS_EXPOSED_VERTEX_B --- amgprec/impl/aggregator/MatchBoxPC.h | 58 +++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 237 +++------------- .../impl/aggregator/processExposedVertex.cpp | 256 ++++++++++++++++++ 3 files changed, 334 insertions(+), 217 deletions(-) create mode 100644 amgprec/impl/aggregator/processExposedVertex.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h 
b/amgprec/impl/aggregator/MatchBoxPC.h index 88e205ba..122a1380 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -205,21 +205,51 @@ extern "C" staticQueue &privateQOwner); inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, - MilanLongInt *verLocPtr, - MilanLongInt *verLocInd, - MilanInt myRank, - MilanReal *edgeLocWeight, - MilanLongInt *candidateMate); + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, - MilanLongInt *verDistance, - MilanLongInt *Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, - MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, - MilanLongInt *ph1_card, MilanLongInt *ph2_card); + inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 980824aa..930a4d37 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -6,6 +6,7 @@ #include "computeCandidateMate.cpp" #include "initialize.cpp" #include "parallelComputeCandidateMateB.cpp" +#include "processExposedVertex.cpp" // *********************************************************************** // @@ -273,211 +274,41 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); + PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, + candidateMate, + verLocInd, + verLocPtr, + StartIndex, + EndIndex, + Mate, + GMate, + Ghost2LocalMap, + edgeLocWeight, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + U, + privateU, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + 
privateQGhostVtx, + privateQMsgType, + privateQOwner); + + tempCounter.clear(); // Do not need this any more + #pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { - /* - * PARALLEL_PROCESS_EXPOSED_VERTEX_B - * The sequential version could be a bit more - * efficient. - * - * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner - * first in a local variable and then, only at the end, append them to the real data structure - * to remove the critical sections. - * - * TODO: Test when it's more efficient to execute this code - * in parallel. - */ - -#pragma omp for reduction(+ \ - : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) - for (v = 0; v < NLVer; v++) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - k = candidateMate[v]; - candidateMate[v] = verLocInd[k]; - w = candidateMate[v]; - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; - fflush(stdout); -#endif - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } - - if (w >= 0) - { - - myCard++; - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost vertex -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message (291):"; - cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); -#endif - - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - /* - //TODO why does it fail if I use a private data structure??? - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - */ - -#pragma omp critical(MSG) - { - - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - QOwner.push_back(ghostOwner); - } // end of critical region - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) - { - - privateU.push_back(v + StartIndex); - privateU.push_back(w); - Mate[v] = w; - // FIXME could this instruction create errors? 
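/* --------------------------------------------------------------------------
 * Editor's note, not part of the patch: the hunks above constantly translate
 * between global vertex ids and local array slots (Mate, GMate, Counter,
 * candidateMate). A small sketch of that convention, assuming the owned
 * global range is [StartIndex, EndIndex] and that ghosts were numbered
 * consecutively when Ghost2LocalMap was built; the wrapper struct and
 * helper names are illustrative, not part of MatchBoxPC.
 * ------------------------------------------------------------------------ */
#include <map>

struct VertexIndexing
{
    long StartIndex, EndIndex, NLVer;    // owned global range and its size
    std::map<long, long> Ghost2LocalMap; // global ghost id -> ghost slot

    bool isLocal(long g) const { return g >= StartIndex && g <= EndIndex; }

    // Slot of an owned vertex in Mate / verLocPtr / candidateMate.
    long localSlot(long g) const { return g - StartIndex; }

    // Slot of a ghost vertex in GMate / Counter.
    long ghostSlot(long g) const { return Ghost2LocalMap.at(g); }

    // candidateMate stores ghost entries after the NLVer owned entries.
    long candidateMateSlot(long g) const
    {
        return isLocal(g) ? g - StartIndex : NLVer + Ghost2LocalMap.at(g);
    }
};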
- GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; - fflush(stdout); -#endif - // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) -#pragma omp critical - { - if (Counter[Ghost2LocalMap[w]] > 0) - { - - Counter[Ghost2LocalMap[w]] -= 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) - } // End of if CandidateMate[w] = v - - } // End of if a Ghost Vertex - else - { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) - { - privateU.push_back(v + StartIndex); - privateU.push_back(w); - - Mate[v] = w; // v is local - // FIXME this instruction could create errors - Mate[w - StartIndex] = v + StartIndex; // w is local - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; - fflush(stdout); -#endif - - } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) - } // End of Else - - continue; - } // End of second if - - } // End of if(w >=0) - - // This piece of code is executed a really small amount of times, I will not allocate a - // huge amount of memory to the private data structures. - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of for ( v=0; v < NLVer; v++ ) - -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); - } - } - -#pragma omp critical(U) - { - while (!privateU.empty()) - { - U.push_back(privateU.pop_front()); - } - } - -#pragma omp single - { - tempCounter.clear(); // Do not need this any more - } - #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -788,7 +619,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ///////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////// -#pragma omp barrier +#pragma omp barrier // TODO check if necessary #pragma omp master { // Data structures for Bundled Messages: diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp 
b/amgprec/impl/aggregator/processExposedVertex.cpp new file mode 100644 index 00000000..a76d3df8 --- /dev/null +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -0,0 +1,256 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +/* + * PARALLEL_PROCESS_EXPOSED_VERTEX_B + * The sequential version could be a bit more + * efficient. + * + * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner + * first in a local variable and then, only at the end, append them to the real data structure + * to remove the critical sections. + * + * TODO: Test when it's more efficient to execute this code + * in parallel. + */ + +inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) +{ + + const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; + const MilanLongInt SIZEINFO = 4; + MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0, S = *SPtr; + MilanLongInt myCard = 0, msgInd = 0; + MilanLongInt NumMessagesBundled = 0; + MilanInt ghostOwner = 0; + +#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + { +#pragma omp for reduction(+ \ + : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) + for (v = 0; v < NLVer; v++) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; + fflush(stdout); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) + { + + myCard++; + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost vertex +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message (291):"; + cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + + msgInd++; + NumMessagesBundled++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + /* + //TODO 
why does it fail if I use a private data structure??? + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + */ + +#pragma omp critical(MSG) + { + + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); + QOwner.push_back(ghostOwner); + } // end of critical region + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) + { + + privateU.push_back(v + StartIndex); + privateU.push_back(w); + Mate[v] = w; + // FIXME could this instruction create errors? + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; + fflush(stdout); +#endif + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) +#pragma omp critical + { + if (Counter[Ghost2LocalMap[w]] > 0) + { + + Counter[Ghost2LocalMap[w]] -= 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; + fflush(stdout); +#endif + } + } + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else + { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) + { + privateU.push_back(v + StartIndex); + privateU.push_back(w); + + Mate[v] = w; // v is local + // FIXME this instruction could create errors + Mate[w - StartIndex] = v + StartIndex; // w is local + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); +#endif + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + + continue; + } // End of second if + + } // End of if(w >=0) + + // This piece of code is executed a really small amount of times, I will not allocate a + // huge amount of memory for the private data structures. 
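/* --------------------------------------------------------------------------
 * Editor's note, not part of the patch: a compact sketch of the
 * PARALLEL_PROCESS_CROSS_EDGE_B step used in the critical region above.
 * Each ghost vertex starts with a counter of unresolved cross edges; every
 * time one of those edges is settled the counter is decremented, and when
 * it hits zero the count S of ghosts still expecting messages is reduced,
 * which is what eventually lets the outer loops stop waiting. The struct
 * wrapper and names here are assumptions of the sketch.
 * ------------------------------------------------------------------------ */
#include <vector>

struct CrossEdgeState
{
    std::vector<long> Counter; // remaining unresolved cross edges per ghost
    long S;                    // ghosts that still expect at least one message
};

static void processCrossEdge(CrossEdgeState &st, long ghostSlot)
{
#pragma omp critical
    {
        if (st.Counter[ghostSlot] > 0)
        {
            st.Counter[ghostSlot] -= 1; // one more incident cross edge resolved
            if (st.Counter[ghostSlot] == 0)
                st.S -= 1; // this ghost needs no further messages
        }
    }
}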
+ adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + + msgInd++; + NumMessagesBundled++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + QLocalVtx.push_back(v + StartIndex); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + QOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of for ( v=0; v < NLVer; v++ ) + +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_front()); + } + +#pragma omp master + { + *myCardPtr = myCard; + *msgIndPtr = msgInd; + *NumMessagesBundledPtr = NumMessagesBundled; + *SPtr = S; + } + +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); + } + } + + } // End of parallel region +} \ No newline at end of file From 63b7602d3abd1adb2bbd41eacd80d994b2ea2ea9 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 6 Jul 2022 13:12:31 -0500 Subject: [PATCH 40/96] refactoring queueTransfer --- amgprec/impl/aggregator/MatchBoxPC.h | 11 ++++++ .../impl/aggregator/processExposedVertex.cpp | 23 ++++------- amgprec/impl/aggregator/queueTransfer.cpp | 38 +++++++++++++++++++ 3 files changed, 56 insertions(+), 16 deletions(-) create mode 100644 amgprec/impl/aggregator/queueTransfer.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 122a1380..96630f9c 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -241,6 +241,17 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); + inline void queuesTransfer(staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index a76d3df8..148951a5 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -7,6 +7,7 @@ #include "primitiveDataTypeDefinitions.h" #include "dataStrStaticQueue.h" #include "omp.h" +#include "queueTransfer.cpp" /* * PARALLEL_PROCESS_EXPOSED_VERTEX_B @@ -227,11 +228,12 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } // End of for ( v=0; v < NLVer; v++ ) -#pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_front()); - } + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #pragma omp 
master { @@ -241,16 +243,5 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, *SPtr = S; } -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); - } - } - } // End of parallel region } \ No newline at end of file diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp new file mode 100644 index 00000000..55b0983a --- /dev/null +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -0,0 +1,38 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void queuesTransfer(staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) +{ + +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_front()); + } + +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_front()); + QGhostVtx.push_back(privateQGhostVtx.pop_front()); + QMsgType.push_back(privateQMsgType.pop_front()); + QOwner.push_back(privateQOwner.pop_front()); + } + } +} \ No newline at end of file From 6dcae6d0c175da7246d5775b3f83820334ed4588 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 6 Jul 2022 15:33:29 -0500 Subject: [PATCH 41/96] fix private queues in PARALLEL_PROCESS_EXPOSED_VERTEX_B --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 92 +++++++++---------- .../impl/aggregator/processExposedVertex.cpp | 33 ++----- amgprec/impl/aggregator/queueTransfer.cpp | 10 +- 3 files changed, 60 insertions(+), 75 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 930a4d37..1fb1d90f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -274,35 +274,35 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); - PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, - candidateMate, - verLocInd, - verLocPtr, - StartIndex, - EndIndex, - Mate, - GMate, - Ghost2LocalMap, - edgeLocWeight, - &myCard, - &msgInd, - &NumMessagesBundled, - &S, - verDistance, - PCounter, - Counter, - myRank, - numProcs, - U, - privateU, - QLocalVtx, - QGhostVtx, - QMsgType, - QOwner, - privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, + candidateMate, + verLocInd, + verLocPtr, + StartIndex, + EndIndex, + Mate, + GMate, + Ghost2LocalMap, + edgeLocWeight, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + U, + privateU, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); tempCounter.clear(); // Do not need this any more @@ -455,6 +455,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; 
fflush(stdout); #endif + + // TODO refactor this // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) if (Counter[Ghost2LocalMap[w]] > 0) @@ -579,29 +581,25 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( // Avoid to ask for the critical section if there is nothing to add if (privateU.size() < UCHUNK && !U.empty()) continue; -#pragma omp critical(U) - { - while (!privateU.empty()) - { - U.push_back(privateU.pop_front()); - } - - myCard += privateMyCard; - } // End of critical U + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); } } // End of while ( /*!Q.empty()*/ !U.empty() ) -#pragma omp critical(privateMsg) +#pragma omp critical { - while (!privateQLocalVtx.empty()) - { - - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); - } + myCard += privateMyCard; } + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 148951a5..86d19eeb 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -11,14 +11,9 @@ /* * PARALLEL_PROCESS_EXPOSED_VERTEX_B - * The sequential version could be a bit more - * efficient. + * TODO: write comment * - * TODO: Maybe it is possible to append the values of QLocalVtx, QGhostVtx, QMsgType and QOwner - * first in a local variable and then, only at the end, append them to the real data structure - * to remove the critical sections. - * - * TODO: Test when it's more efficient to execute this code + * TODO: Test when it's actually more efficient to execute this code * in parallel. */ @@ -119,22 +114,11 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, assert(ghostOwner != myRank); PCounter[ghostOwner]++; - /* - //TODO why does it fail if I use a private data structure??? privateQLocalVtx.push_back(v + StartIndex); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); privateQOwner.push_back(ghostOwner); - */ - -#pragma omp critical(MSG) - { - - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - QOwner.push_back(ghostOwner); - } // end of critical region + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { @@ -149,6 +133,8 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; fflush(stdout); #endif + + //TODO refactor this!! 
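/* --------------------------------------------------------------------------
 * Editor's note, not part of the patch: a minimal sketch of the pattern the
 * queuesTransfer refactoring above introduces. Each thread appends to its
 * own private queues with no synchronisation and only merges them into the
 * shared queues inside short named critical sections, instead of taking a
 * lock per push. The containers are simplified to std::vector here; the
 * real code uses staticQueue and four parallel message queues
 * (QLocalVtx, QGhostVtx, QMsgType, QOwner).
 * ------------------------------------------------------------------------ */
#include <vector>

static void queuesTransferSketch(std::vector<long> &U,
                                 std::vector<long> &privateU,
                                 std::vector<long> &QMsg,
                                 std::vector<long> &privateQMsg)
{
#pragma omp critical(U)
    {
        // Merge this thread's newly matched vertices into the shared work list.
        U.insert(U.end(), privateU.begin(), privateU.end());
    }
    privateU.clear();

#pragma omp critical(privateMsg)
    {
        // Merge this thread's pending message descriptors in one shot.
        QMsg.insert(QMsg.end(), privateQMsg.begin(), privateQMsg.end());
    }
    privateQMsg.clear();
}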
// Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) #pragma omp critical @@ -218,10 +204,11 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, assert(ghostOwner != -1); assert(ghostOwner != myRank); PCounter[ghostOwner]++; - QLocalVtx.push_back(v + StartIndex); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); } // End of if(GHOST) } // End of for loop diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index 55b0983a..becf14cf 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -22,17 +22,17 @@ inline void queuesTransfer(staticQueue &U, #pragma omp critical(U) { while (!privateU.empty()) - U.push_back(privateU.pop_front()); + U.push_back(privateU.pop_back()); } #pragma omp critical(privateMsg) { while (!privateQLocalVtx.empty()) { - QLocalVtx.push_back(privateQLocalVtx.pop_front()); - QGhostVtx.push_back(privateQGhostVtx.pop_front()); - QMsgType.push_back(privateQMsgType.pop_front()); - QOwner.push_back(privateQOwner.pop_front()); + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); } } } \ No newline at end of file From 9b13aef1cedcb9c2bc0bc7d81167a1979d1a73a0 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Fri, 8 Jul 2022 13:32:24 -0500 Subject: [PATCH 42/96] processMathedVertices refactoring --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 354 +++--------------- amgprec/impl/aggregator/extractUChunk.cpp | 34 ++ .../impl/aggregator/processExposedVertex.cpp | 9 +- .../aggregator/processMatchedVertices.cpp | 337 +++++++++++++++++ 4 files changed, 425 insertions(+), 309 deletions(-) create mode 100644 amgprec/impl/aggregator/extractUChunk.cpp create mode 100644 amgprec/impl/aggregator/processMatchedVertices.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 1fb1d90f..b6ac6364 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -7,6 +7,8 @@ #include "initialize.cpp" #include "parallelComputeCandidateMateB.cpp" #include "processExposedVertex.cpp" +#include "processMatchedVertices.cpp" +//#include "extractUChunk.cpp" // *********************************************************************** // @@ -227,7 +229,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt numMessagesToSend; MilanInt BufferSize; MilanLongInt *Buffer; - bool isEmpty; // Declare the locks // TODO destroy the locks @@ -274,6 +275,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); + /* + * PARALLEL_PROCESS_EXPOSED_VERTEX_B + * TODO: write comment + * + * TODO: Test when it's actually more efficient to execute this code + * in parallel. 
+ */ + PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, candidateMate, verLocInd, @@ -306,309 +315,52 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( tempCounter.clear(); // Do not need this any more -#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + /////////////////////////////////////////////////////////////////////////////////// + /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////// +//#define debug +#ifndef debug + + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + processMatchedVertices(NLVer, + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + +#endif + +#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); -#endif - /////////////////////////////////////////////////////////////////////////////////// - /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// - /////////////////////////////////////////////////////////////////////////////////// - isEmpty = false; - -#ifdef COUNT_LOCAL_VERTEX - MilanLongInt localVertices = 0; -#endif - - // TODO what would be the optimal UCHUNK - vector Us; - Us.reserve(UCHUNK); - - while (true) - { - - Us.clear(); -#pragma omp critical(U) - { - // If U is emptu and there are no new node to add to U - if (U.empty() && privateU.empty()) - isEmpty = true; - else - { - if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U - while (!privateU.empty()) - { - U.push_back(privateU.pop_front()); - myCard += privateMyCard; - } - for (int i = 0; i < UCHUNK; i++) - { // Pop the new nodes - if (U.empty()) - break; - Us.push_back(U.pop_front()); - } - } - } // End of critical U - if (isEmpty) - break; - - for (MilanLongInt u : Us) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); -#endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices - -#ifdef COUNT_LOCAL_VERTEX - localVertices++; -#endif - - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: -#pragma omp critical(innerProcessMatched) - { - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); -#endif - - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make 
candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) - { - - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - - // TODO is it possible to lock without a critical region? - // TODO there must be a more elegant and efficient way to do this - /* - while(true) { - if (omp_test_lock(&MateLock[v - StartIndex])) { - if (omp_test_lock(&MateLock[w - StartIndex])) break; - else omp_unset_lock(&MateLock[v - StartIndex]); - } - } - */ - - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); -#endif - - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - - // TODO refactor this - // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); - privateMyCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - } // End of if(CandidateMate(w) = v - } // End of Else - - // omp_unset_lock(&MateLock[v - StartIndex]); - // omp_unset_lock(&MateLock[w - StartIndex]); - - } // End of if(w >=0) - else - { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, 
verDistance, myRank, numProcs); - fflush(stdout); -#endif - /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), - ComputeTag, comm); */ - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - // ghostOwner = inputSubGraph.findOwner(w); - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - - } // End of If (candidateMate[v-StartIndex] == u - - } // End of critical region if - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex -#pragma omp critical(innerProcessMatched) - { - - // while(!omp_test_lock(&MateLock[u - StartIndex])); - - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is local - // Build the Message Packet: - // Message[0] = u; //LOCAL - // Message[1] = v; //GHOST - // Message[2] = SUCCESS; //TYPE - // Send a Request (Asynchronous) - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); -#endif - - QLocalVtx.push_back(u); - QGhostVtx.push_back(v); - QMsgType.push_back(SUCCESS); - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - QOwner.push_back(ghostOwner); - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; - } // End of If( v != Mate[u] ) - - // omp_unset_lock(&MateLock[u - StartIndex]); - - } // End of critical region - } // End of Else //A Ghost Vertex - - } // End of For Loop adj(u) - - } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - - // Avoid to ask for the critical section if there is nothing to add - if (privateU.size() < UCHUNK && !U.empty()) - continue; - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - } - } // End of while ( /*!Q.empty()*/ !U.empty() ) - -#pragma omp critical - { - myCard += privateMyCard; - } - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - -#ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); -#endif - - ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// #ifdef DEBUG_HANG_ if (myRank == 0) cout << "\n(" << myRank << ") Send Bundles" << endl; diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp new file mode 100644 index 00000000..e8a6951c --- /dev/null +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -0,0 +1,34 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +#define UCHUNK 1000 + +inline void extractUChunk( + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU) +{ + + UChunkBeingProcessed.clear(); +#pragma omp critical(U) + { + + if (U.empty() && 
!privateU.empty()) // If U is empty but there are nodes in private U + while (!privateU.empty()) + U.push_back(privateU.pop_front()); + + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; + UChunkBeingProcessed.push_back(U.pop_front()); + } + + } // End of critical U +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 86d19eeb..3847110a 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -9,14 +9,6 @@ #include "omp.h" #include "queueTransfer.cpp" -/* - * PARALLEL_PROCESS_EXPOSED_VERTEX_B - * TODO: write comment - * - * TODO: Test when it's actually more efficient to execute this code - * in parallel. - */ - inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt *candidateMate, MilanLongInt *verLocInd, @@ -48,6 +40,7 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, staticQueue &privateQOwner) { + //TODO define all the constants in a single place! const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; const MilanLongInt FAILURE = 3; diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp new file mode 100644 index 00000000..5f54ef79 --- /dev/null +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -0,0 +1,337 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" +#include "extractUChunk.cpp" + +#define UCHUNK 1000 + +inline void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) +{ + + // TODO define all the constants in a single place! 
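// Illustrative sketch, not part of this patch: one way to satisfy this TODO is
// to hoist the message-type constants into the shared header (a later patch in
// this series does exactly this in MatchBoxPC.h):
//
//   // MatchBoxPC.h
//   const MilanLongInt REQUEST  = 1;
//   const MilanLongInt SUCCESS  = 2;
//   const MilanLongInt FAILURE  = 3;
//   const MilanLongInt SIZEINFO = 4;
//
// after which the per-function copies defined just below can be removed.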
+ const MilanLongInt REQUEST = 1; + const MilanLongInt SUCCESS = 2; + const MilanLongInt FAILURE = 3; + const MilanLongInt SIZEINFO = 4; + MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; + + // TODO check if private queues arrive empty +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef COUNT_LOCAL_VERTEX + MilanLongInt localVertices = 0; +#endif + + // TODO what would be the optimal UCHUNK + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + + while (!U.empty()) + { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); +#endif + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices + +#ifdef COUNT_LOCAL_VERTEX + localVertices++; +#endif + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: +#pragma omp critical(innerProcessMatched) + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); +#endif + + // If the current vertex is pointing to a matched vertex and is not matched + // FIXME is there a way to make candidateMate private? + // for the moment it could generate an error. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and + candidateMate[v - StartIndex] == u) + { + + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + + // TODO is it possible to lock without a critical region? 
+ // TODO there must be a more elegant and efficient way to do this + /* + while(true) { + if (omp_test_lock(&MateLock[v - StartIndex])) { + if (omp_test_lock(&MateLock[w - StartIndex])) break; + else omp_unset_lock(&MateLock[v - StartIndex]); + } + } + */ + + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); +#endif + + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + // Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + + // TODO refactor this + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + if (Counter[Ghost2LocalMap[w]] > 0) + { + Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement + if (Counter[Ghost2LocalMap[w]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; + fflush(stdout); +#endif + } + } // End of if Counter[w] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + // Q.push_back(u); + privateU.push_back(v); + privateU.push_back(w); + privateMyCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + + // omp_unset_lock(&MateLock[v - StartIndex]); + // omp_unset_lock(&MateLock[w - StartIndex]); + + } // End of if(w >=0) + else + { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), + ComputeTag, comm); */ + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); + // ghostOwner = inputSubGraph.findOwner(w); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + + } // End of If (candidateMate[v-StartIndex] == u + + } // End of critical region if + + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { 
// Neighbor is a ghost vertex + +#pragma omp critical(innerProcessMatched) + { + + // while(!omp_test_lock(&MateLock[u - StartIndex])); + + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + { // u is local + // Build the Message Packet: + // Message[0] = u; //LOCAL + // Message[1] = v; //GHOST + // Message[2] = SUCCESS; //TYPE + // Send a Request (Asynchronous) + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); +#endif + + QLocalVtx.push_back(u); + QGhostVtx.push_back(v); + QMsgType.push_back(SUCCESS); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + QOwner.push_back(ghostOwner); + PCounter[ghostOwner]++; + NumMessagesBundled++; + msgInd++; + } // End of If( v != Mate[u] ) + + // omp_unset_lock(&MateLock[u - StartIndex]); + + } // End of critical region + } // End of Else //A Ghost Vertex + + } // End of For Loop adj(u) + + } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + + // Ask for the critical section only when a certain amount + // of data have been accumulated in the private queue + if (privateU.size() < UCHUNK && !U.empty()) + continue; + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + } + } // End of while ( /*!Q.empty()*/ !U.empty() ) + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + +// TODO it is possible that this is not working as expected +// further investigation needed. 
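// Illustrative sketch, not part of this patch: if the atomic accumulation below
// turns out to be the problem, an OpenMP reduction is one possible alternative
// (assuming myCard stays the local scalar initialised from *myCardPtr above):
//
//   #pragma omp parallel ... reduction(+ : myCard) ...
//   {
//       ...
//       myCard += privateMyCard; // folded into myCard when the region ends
//   }
//
// which removes the shared update and the need for the explicit atomic.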
+#pragma omp atomic + myCard += privateMyCard; + +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); + +#endif + } + *myCardPtr = myCard; + *msgIndPtr = msgInd; + *NumMessagesBundledPtr = NumMessagesBundled; + *SPtr = S; +} \ No newline at end of file From 3de1e607eb0a4d9b4e4a857125c791e1001fe614 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 03:39:58 -0500 Subject: [PATCH 43/96] sendBundledMessages refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 14 ++ ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 210 +++------------- .../impl/aggregator/processExposedVertex.cpp | 1 + .../impl/aggregator/sendBundledMessages.cpp | 225 ++++++++++++++++++ 4 files changed, 269 insertions(+), 181 deletions(-) create mode 100644 amgprec/impl/aggregator/sendBundledMessages.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 96630f9c..351dca98 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,6 +66,18 @@ using namespace std; #define NUM_THREAD 4 +// MPI type map +template +MPI_Datatype TypeMap(); +template <> +inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template <> +inline MPI_Datatype TypeMap() { return MPI_INT; } +template <> +inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template <> +inline MPI_Datatype TypeMap() { return MPI_FLOAT; } + #ifdef __cplusplus extern "C" { @@ -150,6 +162,8 @@ extern "C" #define MilanRealMin MINUS_INFINITY #endif + + // Function of find the owner of a ghost vertex using binary search: inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index b6ac6364..cfd6b927 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -8,7 +8,7 @@ #include "parallelComputeCandidateMateB.cpp" #include "processExposedVertex.cpp" #include "processMatchedVertices.cpp" -//#include "extractUChunk.cpp" +#include "sendBundledMessages.cpp" // *********************************************************************** // @@ -85,17 +85,6 @@ #ifdef SERIAL_MPI #else -// MPI type map -template -MPI_Datatype TypeMap(); -template <> -inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template <> -inline MPI_Datatype TypeMap() { return MPI_INT; } -template <> -inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template <> -inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION // WARNING: The vertex block on a given rank is contiguous @@ -177,6 +166,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! 
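// Illustrative usage sketch, not part of this patch, for the TypeMap<> helper
// this patch moves into MatchBoxPC.h: the specialization matching the element
// type selects the MPI datatype at compile time, e.g.
//
//   MilanLongInt u = ...;
//   MPI_Bsend(&u, 1, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
//
// (assuming MilanLongInt maps to MPI_LONG_LONG as in the header hunk above).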
+ // TODO move this inseide the initialization function MilanLongInt *PCounter = new MilanLongInt[numProcs]; for (int i = 0; i < numProcs; i++) PCounter[i] = 0; @@ -220,13 +210,10 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt S; MilanLongInt privateMyCard = 0; staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - MilanLongInt myIndex = 0; vector PCumulative, PMessageBundle, PSizeInfoMessages; vector SRequest; // Requests that are used for each send message vector SStatus; // Status of sent messages, used in MPI_Wait MilanLongInt MessageIndex = 0; // Pointer for current message - MilanInt OneMessageSize = 0; - MilanLongInt numMessagesToSend; MilanInt BufferSize; MilanLongInt *Buffer; @@ -318,9 +305,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// -//#define debug -#ifndef debug - + vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); processMatchedVertices(NLVer, @@ -336,7 +321,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( verLocPtr, verLocInd, verDistance, - PCounter, + PCounter, Counter, myRank, numProcs, @@ -354,166 +339,32 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( privateQMsgType, privateQOwner); + ///////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////// -#endif - -#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) - { - - -#ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Send Bundles" << endl; - fflush(stdout); -#endif - ///////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////// -#pragma omp barrier // TODO check if necessary -#pragma omp master - { - // Data structures for Bundled Messages: - try - { - PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message - PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure - PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize - PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable - PSizeInfoMessages.resize(numProcs * 3, 0); - - for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! 
- PCumulative[i + 1] = PCumulative[i] + PCounter[i]; - - // OMP not worth parallelizing - // Reuse PCounter to keep track of how many messages were inserted: - for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! - PCounter[i] = 0; - // Build the Message Bundle packet: - - // OMP Not parallelizable - for (MilanInt i = 0; i < NumMessagesBundled; i++) - { // Changed by Fabio to be an integer, addresses needs to be integers! - myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; - PMessageBundle[myIndex + 0] = QLocalVtx[i]; - PMessageBundle[myIndex + 1] = QGhostVtx[i]; - PMessageBundle[myIndex + 2] = QMsgType[i]; - PCounter[QOwner[i]]++; - } - - // Send the Bundled Messages: Use ISend - - try - { - SRequest.reserve(numProcs * 2); // At most two messages per processor - SStatus.reserve(numProcs * 2); // At most two messages per processor - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } - MPI_Request myReq; // A sample request - SRequest.resize(numProcs * 2, myReq); - MPI_Status myStat; // A sample status - SStatus.resize(numProcs * 2, myStat); - - // Send the Messages - for (MilanInt i = 0; i < numProcs; i++) - { // Changed by Fabio to be an integer, addresses needs to be integers! - if (i == myRank) // Do not send anything to yourself - continue; - // Send the Message with information about the size of next message: - // Build the Message Packet: - PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message - PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet - PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; - fflush(stdout); -#endif - if (PSizeInfoMessages[i * 3 + 0] > 0) - { // Send only if it is a nonempty packet - MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[MessageIndex]); - msgActual++; - MessageIndex++; - // Now Send the message with the data packet: -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending Bundle to : " << i << endl; - for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) - cout << PMessageBundle[k] << ","; - cout << endl; - fflush(stdout); -#endif - MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); - MessageIndex++; - } // End of if size > 0 - } - // Free up temporary memory: - PCumulative.clear(); - QLocalVtx.clear(); - QGhostVtx.clear(); - QMsgType.clear(); - QOwner.clear(); - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ")Total number of potential message X 2 = " << numGhostEdges * 2; - cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; - if (numGhostEdges > 0) - { - cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(numGhostEdges * 2)) * 100.0 << "% \n"; - } - fflush(stdout); -#endif - - // Allocate memory for MPI Send messages: - /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! 
*/ - OneMessageSize = 0; - MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet - // How many messages to send? - // Potentially three kinds of messages will be sent/received: - // Request, Success, Failure. - // But only two will be sent from a given processor. - // Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; - BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; - - Buffer = 0; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; - cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; - cout << "\n(" << myRank << ")BufferSize = " << BufferSize; - cout << "\n(" << myRank << ")Attaching Buffer on.. "; - fflush(stdout); -#endif - if (BufferSize > 0) - { - Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory - if (Buffer == 0) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; - exit(1); - } - MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer - } - } // End of master + sendBundledMessages(&numGhostEdges, + &BufferSize, + Buffer, + PCumulative, + PMessageBundle, + PSizeInfoMessages, + PCounter, + NumMessagesBundled, + &msgActual, + &MessageIndex, + numProcs, + myRank, + ComputeTag, + BundleTag, + comm, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + SRequest, + SStatus); - } // end of parallel region ///////////////////////// END OF SEND BUNDLED MESSAGES ////////////////////////////////// finishTime = MPI_Wtime(); @@ -773,10 +624,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - /* - RECEIVE message ( u, v, message_type ); - // u is a GHOST vertex ... v is a LOCAL vertex - */ + #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 3847110a..dd9562d5 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -215,6 +215,7 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQMsgType, privateQOwner); +//TODO move this outside of the parallel region!! 
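// Illustrative sketch, not part of this patch: what this TODO suggests is to
// drop the "omp master" block below and perform the write-back once, after the
// enclosing "#pragma omp parallel" region closes, e.g.
//
//   } // end of parallel region
//   *myCardPtr = myCard;   // and likewise for the other output pointers
//
// (assuming myCard is shared across the region, so its final value is visible
// once the region has ended).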
#pragma omp master { *myCardPtr = myCard; diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp new file mode 100644 index 00000000..e16c5669 --- /dev/null +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -0,0 +1,225 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus) +{ + + MilanLongInt myIndex = 0, msgActual = *msgActualPtr, MessageIndex = *MessageIndexPtr, numGhostEdges = *numGhostEdgesPtr, numMessagesToSend; + const MilanLongInt SIZEINFO = 4; + MilanInt i = 0, OneMessageSize = 0, BufferSize = *BufferSizePtr; + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Send Bundles" << endl; + fflush(stdout); +#endif + +#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD) + { +#pragma omp master + { +// Data structures for Bundled Messages: +#pragma omp task depend(inout \ + : PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \ + : NumMessagesBundled, numProcs) + {try { + PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message + PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets +} +catch (length_error) +{ + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); +} +PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize +PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable +PSizeInfoMessages.resize(numProcs * 3, 0); +} + +#pragma omp task depend(inout \ + : PCumulative) depend(in \ + : PCounter) +{ + for (i = 0; i < numProcs; i++) + PCumulative[i + 1] = PCumulative[i] + PCounter[i]; +} + +#pragma omp task depend(inout \ + : PCounter) +{ + // Reuse PCounter to keep track of how many messages were inserted: + for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! 
+ PCounter[i] = 0; +} + +// Build the Message Bundle packet: +#pragma omp task depend(in \ + : PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \ + : myIndex, PMessageBundle, PCounter) +{ + for (i = 0; i < NumMessagesBundled; i++) + { + myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; + PMessageBundle[myIndex + 0] = QLocalVtx[i]; + PMessageBundle[myIndex + 1] = QGhostVtx[i]; + PMessageBundle[myIndex + 2] = QMsgType[i]; + PCounter[QOwner[i]]++; + } +} + +// Send the Bundled Messages: Use ISend +#pragma omp task depend(out \ + : SRequest, SStatus) +{ + try + { + SRequest.reserve(numProcs * 2); // At most two messages per processor + SStatus.reserve(numProcs * 2); // At most two messages per processor + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } +} + +// Send the Messages +#pragma omp task depend(inout \ + : SRequest, PSizeInfoMessages, PCumulative) depend(out \ + : msgActual, MessageIndex) +{ + for (i = 0; i < numProcs; i++) + { // Changed by Fabio to be an integer, addresses needs to be integers! + if (i == myRank) // Do not send anything to yourself + continue; + // Send the Message with information about the size of next message: + // Build the Message Packet: + PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message + PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet + PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; + fflush(stdout); +#endif + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[MessageIndex]); + msgActual++; + MessageIndex++; + // Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; + for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) + cout << PMessageBundle[k] << ","; + cout << endl; + fflush(stdout); +#endif + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); + MessageIndex++; + } // End of if size > 0 + } +} + +#pragma omp task depend(inout \ + : PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner) +{ + + // Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); +} + +#pragma omp task depend(inout : OneMessageSize, BufferSize) depend(out : numMessagesToSend) depend(in : numGhostEdges) +{ + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; + if (numGhostEdges > 0) + { + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(numGhostEdges * 2)) * 100.0 << "% \n"; + } + fflush(stdout); +#endif + + // Allocate memory for MPI Send messages: + /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! 
*/ + OneMessageSize = 0; + MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; + BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; +} + +#pragma omp task depend(out : Buffer) depend(in : BufferSize) +{ + Buffer = 0; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << BufferSize; + cout << "\n(" << myRank << ")Attaching Buffer on.. "; + fflush(stdout); +#endif + if (BufferSize > 0) + { + Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; + exit(1); + } + MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer + } +} +} +} + +*MessageIndexPtr = MessageIndex; +*msgActualPtr = msgActual; +*numGhostEdgesPtr = numGhostEdges; +*BufferSizePtr = BufferSize; +} \ No newline at end of file From df1e4a4616f1b7d9a2aaa8a4b790d7c7d9dca302 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 04:31:51 -0500 Subject: [PATCH 44/96] PROCESS_CROSS_EDGE refactoring --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 81 ++----------------- amgprec/impl/aggregator/processCrossEdge.cpp | 33 ++++++++ .../impl/aggregator/processExposedVertex.cpp | 20 +---- .../aggregator/processMatchedVertices.cpp | 16 +--- 4 files changed, 45 insertions(+), 105 deletions(-) create mode 100644 amgprec/impl/aggregator/processCrossEdge.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index cfd6b927..6b189da5 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -9,6 +9,7 @@ #include "processExposedVertex.cpp" #include "processMatchedVertices.cpp" #include "sendBundledMessages.cpp" +//#include "processCrossEdge.cpp" // *********************************************************************** // @@ -506,21 +507,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif - // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + } // 
End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -759,20 +748,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); #endif - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) - if (Counter[Ghost2LocalMap[u]] > 0) - { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement - if (Counter[Ghost2LocalMap[u]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages" << endl; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -784,22 +760,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; fflush(stdout); #endif - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - // process it again - if (Counter[Ghost2LocalMap[u]] > 0) - { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement - if (Counter[Ghost2LocalMap[u]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -877,20 +839,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -949,22 +898,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")Message type is FAILURE" << endl; fflush(stdout); #endif - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - // process it again - if (Counter[Ghost2LocalMap[u]] > 0) - { - Counter[Ghost2LocalMap[u]] = Counter[Ghost2LocalMap[u]] - 1; // Decrement - if (Counter[Ghost2LocalMap[u]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << u << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,u) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); } // End of else: CASE III } // End of else: CASE I } // End of if (!MsgQ.empty()) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp new file mode 100644 index 
00000000..f99bee1a --- /dev/null +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -0,0 +1,33 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void PROCESS_CROSS_EDGE(vector &Counter, + map &Ghost2LocalMap, + MilanLongInt edge, + MilanLongInt *SPtr) +{ + MilanLongInt S = *SPtr; + // Decrement the counter: + // Start: PARALLEL_PROCESS_CROSS_EDGE_B + if (Counter[Ghost2LocalMap[edge]] > 0) + { + Counter[Ghost2LocalMap[edge]] -= 1; // Decrement + if (Counter[Ghost2LocalMap[edge]] == 0) + { + S--; // Decrement S +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; + fflush(stdout); +#endif + } + } // End of if Counter[edge] > 0 + // End: PARALLEL_PROCESS_CROSS_EDGE_B + *SPtr = S; +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index dd9562d5..9ba155f9 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -8,6 +8,7 @@ #include "dataStrStaticQueue.h" #include "omp.h" #include "queueTransfer.cpp" +#include "processCrossEdge.cpp" inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt *candidateMate, @@ -129,24 +130,7 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, //TODO refactor this!! // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v) -#pragma omp critical - { - if (Counter[Ghost2LocalMap[w]] > 0) - { - - Counter[Ghost2LocalMap[w]] -= 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 5f54ef79..1e496888 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -177,20 +177,8 @@ inline void processMatchedVertices( // TODO refactor this // Decrement the counter: - // Start: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) - if (Counter[Ghost2LocalMap[w]] > 0) - { - Counter[Ghost2LocalMap[w]] = Counter[Ghost2LocalMap[w]] - 1; // Decrement - if (Counter[Ghost2LocalMap[w]] == 0) - { - S--; // Decrement S -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << w << " has received all its messages"; - fflush(stdout); -#endif - } - } // End of if Counter[w] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B(v,w) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else From d19443052d9734b12f21fe8b15848354a1d25683 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 05:24:31 -0500 Subject: [PATCH 45/96] Insert private queue error in processMatchedVertices.cpp --- .../aggregator/processMatchedVertices.cpp | 62 +++++++++++++++---- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 
1e496888..567ae2e3 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -8,7 +8,7 @@ #include "omp.h" #include "extractUChunk.cpp" -#define UCHUNK 1000 +//#define privateQueues inline void processMatchedVertices( MilanLongInt NLVer, @@ -152,13 +152,22 @@ inline void processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + +#ifdef privateQueues + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); +#endif +#ifndef privateQueues + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(REQUEST); QOwner.push_back(ghostOwner); +#endif PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; @@ -217,16 +226,25 @@ inline void processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - /* MPI_Bsend(&Message[0], 3, MPI_INT, inputSubGraph.findOwner(w), - ComputeTag, comm); */ - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); + // ghostOwner = inputSubGraph.findOwner(w); ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + +#ifdef privateQueues + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); +#endif +#ifndef privateQueues + QLocalVtx.push_back(v); + QGhostVtx.push_back(w); + QMsgType.push_back(FAILURE); QOwner.push_back(ghostOwner); +#endif + PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; @@ -264,13 +282,23 @@ inline void processMatchedVertices( fflush(stdout); #endif - QLocalVtx.push_back(u); - QGhostVtx.push_back(v); - QMsgType.push_back(SUCCESS); ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + +#ifdef privateQueues + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); +#endif +#ifndef privateQueues + QLocalVtx.push_back(u); + QGhostVtx.push_back(v); + QMsgType.push_back(SUCCESS); QOwner.push_back(ghostOwner); +#endif + PCounter[ghostOwner]++; NumMessagesBundled++; msgInd++; @@ -289,12 +317,22 @@ inline void processMatchedVertices( // of data have been accumulated in the private queue if (privateU.size() < UCHUNK && !U.empty()) continue; + +#ifdef privateQueues +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } +#endif +#ifndef privateQueues queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner); +#endif } } // End of while ( /*!Q.empty()*/ !U.empty() ) From 64c23f93f8358adebe790913e9c31ce0b3dcc0d9 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 10:01:50 -0500 Subject: [PATCH 46/96] processMessags partial refactoring, message const refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 4 + ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 144 ++---------------- .../impl/aggregator/computeCandidateMate.cpp | 41 +++-- 
.../parallelComputeCandidateMateB.cpp | 2 +- .../impl/aggregator/processExposedVertex.cpp | 7 +- .../aggregator/processMatchedVertices.cpp | 5 - amgprec/impl/aggregator/processMessages.cpp | 130 ++++++++++++++++ 7 files changed, 177 insertions(+), 156 deletions(-) create mode 100644 amgprec/impl/aggregator/processMessages.cpp diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 351dca98..c6445c81 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -65,6 +65,10 @@ using namespace std; #define NUM_THREAD 4 +const MilanLongInt REQUEST = 1; +const MilanLongInt SUCCESS = 2; +const MilanLongInt FAILURE = 3; +const MilanLongInt SIZEINFO = 4; // MPI type map template diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 6b189da5..5a5ef836 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -9,7 +9,8 @@ #include "processExposedVertex.cpp" #include "processMatchedVertices.cpp" #include "sendBundledMessages.cpp" -//#include "processCrossEdge.cpp" +#include "processMessages.cpp" + // *********************************************************************** // @@ -155,10 +156,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( // Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] Message.resize(3, -1); - const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; - const MilanLongInt SIZEINFO = 4; MilanLongInt message_type = 0; // Data structures for Message Bundling: // Although up to two messages can be sent along any cross edge, @@ -186,7 +183,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt k = -1, adj1 = -1, adj2 = -1; MilanLongInt k1 = -1, adj11 = -1, adj12 = -1; MilanLongInt myCard = 0; - MilanInt Sender = 0; // This is the rank of the sending nodes, it has to be an integer! Fabio // Build the Ghost Vertex Set: Vg map Ghost2LocalMap; // Map each ghost vertex to a local vertex @@ -614,90 +610,19 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")About to begin Message processing phase ... S=" << S << endl; - fflush(stdout); -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); -#endif - // BLOCKING RECEIVE: -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << " Waiting for blocking receive..." 
<< endl; - fflush(stdout); - fflush(stdout); -#endif - error_codeC = MPI_Recv(&Message[0], 3, TypeMap(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS) - { - MPI_Error_string(error_codeC, error_message, &message_length); - cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n"; - fflush(stdout); - } - Sender = computeStatus.MPI_SOURCE; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl; - fflush(stdout); -#endif - // If the Message Type is a size indicator, then receive the bigger message. - if (Message[2] == SIZEINFO) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; - fflush(stdout); -#endif - bundleSize = Message[0]; //#of integers in the message - // Build the Message Buffer: - if (!ReceiveBuffer.empty()) - ReceiveBuffer.clear(); // Empty it out first - ReceiveBuffer.resize(bundleSize, -1); // Initialize -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message Bundle Before: " << endl; - for (i = 0; i < bundleSize; i++) - cout << ReceiveBuffer[i] << ","; - cout << endl; - fflush(stdout); -#endif - // Receive the message - error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap(), Sender, BundleTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS) - { - MPI_Error_string(error_codeC, error_message, &message_length); - cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; - fflush(stdout); - } -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message Bundle After: " << endl; - for (i = 0; i < bundleSize; i++) - cout << ReceiveBuffer[i] << ","; - cout << endl; - fflush(stdout); -#endif - } - else - { // Just a single message: -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; - fflush(stdout); -#endif - // Add the current message to Queue: - bundleSize = 3; //#of integers in the message - // Build the Message Buffer: - if (!ReceiveBuffer.empty()) - ReceiveBuffer.clear(); // Empty it out first - ReceiveBuffer.resize(bundleSize, -1); // Initialize + processMessages(error_codeC, + numProcs, + myRank, + ComputeTag, + BundleTag, + comm, + Message, + error_message, + message_length, + ReceiveBuffer, + &bundleSize); + - ReceiveBuffer[0] = Message[0]; // u - ReceiveBuffer[1] = Message[1]; // v - ReceiveBuffer[2] = Message[2]; // message_type - } bundleCounter = 0; while (bundleCounter < bundleSize) { @@ -707,17 +632,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( bundleCounter++; message_type = ReceiveBuffer[bundleCounter]; // TYPE bundleCounter++; -#ifdef DEBUG_GHOST_ - if ((v < StartIndex) || (v > EndIndex)) - { - cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; - fflush(stdout); - } -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl; - fflush(stdout); -#endif + // CASE I: REQUEST if (message_type == REQUEST) { @@ -774,33 +689,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( if (candidateMate[v - StartIndex] == u) { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: 
PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - w = -1; - heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN - for (k1 = adj11; k1 < adj12; k1++) - { - if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) - { // Is it a ghost vertex? - if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched - continue; - } - else - { // A local vertex - if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched - continue; - } - - if ((edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) - { - heaviestEdgeWt = edgeLocWeight[k1]; - w = verLocInd[k1]; - } - } // End of for loop + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); candidateMate[v - StartIndex] = w; - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; fflush(stdout); @@ -830,7 +720,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( { Mate[v - StartIndex] = w; // v is local GMate[Ghost2LocalMap[w]] = v; // w is ghost - // Q.push_back(u); U.push_back(v); U.push_back(w); myCard++; @@ -878,7 +767,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - // MPI_Bsend(&Message[0], 3, MilanMpiLongInt, findOwnerOfGhost(w, verDistance, myRank, numProcs), ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp index 92e3c92b..b6d8b3ff 100644 --- a/amgprec/impl/aggregator/computeCandidateMate.cpp +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -11,21 +11,23 @@ */ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, - MilanLongInt* verLocInd, - MilanReal* edgeLocWeight) + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight) { MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN + MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN int finalK; - for (int k = adj1; k < adj2; k++) { + for (int k = adj1; k < adj2; k++) + { if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) + { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; finalK = k; } - } //End of for loop + } // End of for loop return finalK; } @@ -45,25 +47,32 @@ inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, */ inline MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, - MilanReal* edgeLocWeight, + MilanReal *edgeLocWeight, MilanLongInt k, - MilanLongInt* verLocInd, + MilanLongInt *verLocInd, MilanLongInt StartIndex, MilanLongInt EndIndex, - vector & GMate, - MilanLongInt* Mate, - map & Ghost2LocalMap) + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) { + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + MilanInt w = -1; - MilanReal heaviestEdgeWt = MilanRealMin; //Assign the smallest Value possible first LDBL_MIN - for (k = adj1; k < adj2; k++) { - if 
(isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) continue; + MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) + { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + continue; if ((edgeLocWeight[k] > heaviestEdgeWt) || - ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) + { heaviestEdgeWt = edgeLocWeight[k]; w = verLocInd[k]; } - } //End of for loop + } // End of for loop + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + return w; } \ No newline at end of file diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp index ced93456..d3e39056 100644 --- a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -18,7 +18,7 @@ inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, MilanLongInt v = -1; -#pragma omp parallel private(v) default(shared) num_threads(4) +#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD) { #pragma omp for schedule(static) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 9ba155f9..ff57b5e5 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -41,17 +41,12 @@ inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, staticQueue &privateQOwner) { - //TODO define all the constants in a single place! - const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; - const MilanLongInt SIZEINFO = 4; MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0, S = *SPtr; MilanLongInt myCard = 0, msgInd = 0; MilanLongInt NumMessagesBundled = 0; MilanInt ghostOwner = 0; -#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) +#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { #pragma omp for reduction(+ \ : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 567ae2e3..c6c729ad 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -43,11 +43,6 @@ inline void processMatchedVertices( staticQueue &privateQOwner) { - // TODO define all the constants in a single place! 
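// --- Editor's note (annotation, not part of the patch): the constants removed just
// below are the same REQUEST / SUCCESS / FAILURE / SIZEINFO values added to
// MatchBoxPC.h earlier in this patch, so every translation unit now shares a single
// definition. For reference, a minimal sketch of the 3-integer message they tag,
// using names that appear elsewhere in this series (Message, v, w, ghostOwner,
// ComputeTag, comm) and assuming TypeMap<MilanLongInt>() is the MPI type-map helper:
//
//   Message[0] = v;        // local endpoint of the cross edge
//   Message[1] = w;        // ghost endpoint, owned by rank ghostOwner
//   Message[2] = REQUEST;  // or SUCCESS / FAILURE / SIZEINFO
//   MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);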
- const MilanLongInt REQUEST = 1; - const MilanLongInt SUCCESS = 2; - const MilanLongInt FAILURE = 3; - const MilanLongInt SIZEINFO = 4; MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp new file mode 100644 index 00000000..ae2c8671 --- /dev/null +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -0,0 +1,130 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + +inline void processMessages(int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt *BundleSizePtr) +{ + + MilanInt Sender; + MPI_Status computeStatus; + MilanLongInt bundleSize = *BundleSizePtr; + +#ifdef PRINT_DEBUG_INFO_ + cout + << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")About to begin Message processing phase ... S=" << S << endl; + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + // BLOCKING RECEIVE: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << " Waiting for blocking receive..." << endl; + fflush(stdout); + fflush(stdout); +#endif + + error_codeC = MPI_Recv(&Message[0], 3, TypeMap(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) + { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n"; + fflush(stdout); + } + Sender = computeStatus.MPI_SOURCE; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl; + fflush(stdout); +#endif + + if (Message[2] == SIZEINFO) + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; + fflush(stdout); +#endif + bundleSize = Message[0]; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message Bundle Before: " << endl; + for (i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); +#endif + // Receive the message + error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) + { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; + fflush(stdout); + } +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message Bundle After: " << endl; + for (i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); +#endif + } + else + { // Just a single message: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << 
myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; + fflush(stdout); +#endif + // Add the current message to Queue: + bundleSize = 3; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize + + ReceiveBuffer[0] = Message[0]; // u + ReceiveBuffer[1] = Message[1]; // v + ReceiveBuffer[2] = Message[2]; // message_type + } + +#ifdef DEBUG_GHOST_ + if ((v < StartIndex) || (v > EndIndex)) + { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl; + fflush(stdout); +#endif + + *BundleSizePtr = bundleSize; + return; +} \ No newline at end of file From 32777cc15c562f8288e084d3bcf819f5c9cc4d08 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 10 Jul 2022 11:09:10 -0500 Subject: [PATCH 47/96] clean partial refactoring --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 75 ++++------------ amgprec/impl/aggregator/clean.cpp | 88 +++++++++++++++++++ 2 files changed, 103 insertions(+), 60 deletions(-) create mode 100644 amgprec/impl/aggregator/clean.cpp diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 5a5ef836..ad5d1142 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -10,7 +10,7 @@ #include "processMatchedVertices.cpp" #include "sendBundledMessages.cpp" #include "processMessages.cpp" - +#include "clean.cpp" // *********************************************************************** // @@ -610,7 +610,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - processMessages(error_codeC, + processMessages(error_codeC, numProcs, myRank, ComputeTag, @@ -622,7 +622,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( ReceiveBuffer, &bundleSize); - bundleCounter = 0; while (bundleCounter < bundleSize) { @@ -802,66 +801,22 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif } // End of while (true) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Waitall= " << endl; - fflush(stdout); -#endif -#ifdef DEBUG_HANG_ - cout << "\n(" << myRank << ") Waitall " << endl; - fflush(stdout); -#endif - // MPI_Barrier(comm); - // Cleanup Phase - MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); - // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer - if (BufferSize > 0) - { - MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer - free(Buffer); // Free the memory that was allocated - } + clean(myRank, + MessageIndex, + SRequest, + SStatus, + BufferSize, + Buffer, + msgActual, + msgActualSent, + msgInd, + msgIndSent, + NumMessagesBundled, + msgPercent); + finishTime = MPI_Wtime(); *ph2_time = finishTime - startTime; // Time taken for Phase-2 *ph2_card = myCard; // 
Cardinality at the end of Phase-2 - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")End of function to compute matching: " << endl; - fflush(stdout); - cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; - fflush(stdout); - cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; - fflush(stdout); - cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; - fflush(stdout); -#endif -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; - cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; - cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; - if (msgInd > 0) - { - cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; - } - fflush(stdout); -#endif - - *msgActualSent = msgActual; - *msgIndSent = msgInd; - if (msgInd > 0) - { - *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; - } - else - { - *msgPercent = 0; - } - -#ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Done" << endl; - fflush(stdout); -#endif - // MPI_Barrier(comm); } // End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate #endif diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp new file mode 100644 index 00000000..6c5543b8 --- /dev/null +++ b/amgprec/impl/aggregator/clean.cpp @@ -0,0 +1,88 @@ +#include "MatchBoxPC.h" +#include +#include +#include +#include +#include +#include "primitiveDataTypeDefinitions.h" +#include "dataStrStaticQueue.h" +#include "omp.h" + + +//TODO comment +//TODO use task +//TODO destroy the locks + +inline void clean(MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent) +{ + // Cleanup Phase + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Waitall= " << endl; + fflush(stdout); +#endif +#ifdef DEBUG_HANG_ + cout << "\n(" << myRank << ") Waitall " << endl; + fflush(stdout); +#endif + return; + + MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); + + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated + } + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")End of function to compute matching: " << endl; + fflush(stdout); + cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; + fflush(stdout); + cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; + fflush(stdout); + cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; + cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; + if (msgInd > 0) + { + cout << "\n(" << myRank << ") Percentage of messages bundled 
= " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; + } + fflush(stdout); +#endif + + *msgActualSent = msgActual; + *msgIndSent = msgInd; + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { + *msgPercent = 0; + } + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Done" << endl; + fflush(stdout); +#endif +} \ No newline at end of file From 36bd3a51a22a26589728ab2cf06c32f8f3336e3d Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 11 Jul 2022 16:31:58 -0500 Subject: [PATCH 48/96] Makefile fix --- amgprec/impl/aggregator/Makefile | 13 + amgprec/impl/aggregator/MatchBoxPC.h | 314 +++++++++++------- ...DomEdgesLinearSearchMesgBndlSmallMateC.cpp | 6 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 12 - amgprec/impl/aggregator/clean.cpp | 11 +- .../impl/aggregator/computeCandidateMate.cpp | 4 +- amgprec/impl/aggregator/extractUChunk.cpp | 11 +- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 8 +- amgprec/impl/aggregator/initialize.cpp | 14 +- amgprec/impl/aggregator/isAlreadyMatched.cpp | 8 +- .../parallelComputeCandidateMateB.cpp | 10 +- amgprec/impl/aggregator/processCrossEdge.cpp | 10 +- .../impl/aggregator/processExposedVertex.cpp | 14 +- .../aggregator/processMatchedVertices.cpp | 11 +- amgprec/impl/aggregator/processMessages.cpp | 9 +- amgprec/impl/aggregator/queueTransfer.cpp | 9 +- .../impl/aggregator/sendBundledMessages.cpp | 11 +- 17 files changed, 233 insertions(+), 242 deletions(-) diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 1f6f52af..f1760822 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -62,9 +62,22 @@ amg_s_parmatch_smth_bld.o \ amg_s_parmatch_spmm_bld_inner.o MPCOBJS=MatchBoxPC.o \ +sendBundledMessages.o \ +initialize.o \ +extractUChunk.o \ +isAlreadyMatched.o \ +findOwnerOfGhost.o \ +computeCandidateMate.o \ +parallelComputeCandidateMateB.o \ +processMatchedVertices.o \ +processCrossEdge.o \ +queueTransfer.o \ +processMessages.o \ +processExposedVertex.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o + OBJS = $(FOBJS) $(MPCOBJS) LIBNAME=libamg_prec.a diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index c6445c81..dfcb6f7e 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -64,7 +64,10 @@ #include "dataStrStaticQueue.h" using namespace std; + #define NUM_THREAD 4 +#define UCHUNK 1000 + const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; const MilanLongInt FAILURE = 3; @@ -166,119 +169,206 @@ extern "C" #define MilanRealMin MINUS_INFINITY #endif - - // Function of find the owner of a ghost vertex using binary search: - inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs); - - inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt *verLocInd, - MilanReal *edgeLocWeight); - - inline bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap); - - inline MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal *edgeLocWeight, - MilanLongInt k, - MilanLongInt *verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap); - - 
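// --- Editor's note (annotation, not part of the patch): the inline declarations
// removed in this hunk reappear further down without the qualifier, because each
// helper is now built into its own object file (see the MPCOBJS list added in the
// Makefile hunk above) instead of being #include'd as a .cpp into the main
// translation unit. The pattern, sketched with a hypothetical helper foo rather than
// any real function of this library:
//
//   // MatchBoxPC.h  -- declaration only, visible to every .cpp
//   void foo(MilanLongInt x);
//
//   // foo.cpp       -- single definition, compiled once into foo.o and linked in
//   #include "MatchBoxPC.h"
//   void foo(MilanLongInt x) { /* ... */ }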
inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *numGhostEdgesPtr, - MilanLongInt *numGhostVerticesPtr, - MilanLongInt *S, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - omp_lock_t *MateLock, - map &Ghost2LocalMap, - vector &Counter, - vector &verGhostPtr, - vector &verGhostInd, - vector &tempCounter, - vector &GMate, - vector &Message, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, - MilanLongInt *verLocPtr, - MilanLongInt *verLocInd, - MilanInt myRank, - MilanReal *edgeLocWeight, - MilanLongInt *candidateMate); - - inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, - MilanLongInt *candidateMate, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - MilanLongInt *Mate, - vector &GMate, - map &Ghost2LocalMap, - MilanReal *edgeLocWeight, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, - MilanLongInt *SPtr, - MilanLongInt *verDistance, - MilanLongInt *PCounter, - vector &Counter, - MilanInt myRank, - MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - inline void queuesTransfer(staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, - MilanLongInt *verDistance, - MilanLongInt *Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, - MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, - MilanLongInt *ph1_card, MilanLongInt *ph2_card); + MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + MilanInt myRank, MilanInt numProcs); + + MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight); + + void queuesTransfer(staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); + + MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); + + void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt 
*numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + omp_lock_t *MateLock, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void clean(MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent); + + void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); + + void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void PROCESS_CROSS_EDGE(vector &Counter, + map &Ghost2LocalMap, + MilanLongInt edge, + MilanLongInt *SPtr); + + void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); + + void processMessages(int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt *BundleSizePtr); 
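// --- Editor's note (annotation, not part of the patch): processMessages(), declared
// just above, wraps a two-step blocking receive. A 3-integer header [u, v, type]
// always arrives first on ComputeTag; when type == SIZEINFO, u carries the length of
// a bundled message that follows from the same sender on BundleTag. Roughly, with
// the parameter names above and error handling omitted:
//
//   MPI_Status st;
//   MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &st);
//   MilanInt sender = st.MPI_SOURCE;
//   MilanLongInt bundleSize = 3;
//   if (Message[2] == SIZEINFO) {            // header announces a bundled message
//     bundleSize = Message[0];
//     ReceiveBuffer.clear();
//     ReceiveBuffer.resize(bundleSize, -1);
//     MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(),
//              sender, BundleTag, comm, &st);
//   } else {                                 // a single [u, v, type] message
//     ReceiveBuffer.assign(Message.begin(), Message.begin() + 3);
//   }
//   *BundleSizePtr = bundleSize;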
+ + void extractUChunk( + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU); + + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 8be438b6..f03f726f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -72,12 +72,6 @@ #ifdef SERIAL_MPI #else -//MPI type map -template MPI_Datatype TypeMap(); -template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template<> inline MPI_Datatype TypeMap() { return MPI_INT; } -template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION //WARNING: The vertex block on a given rank is contiguous diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ad5d1142..50930601 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -1,16 +1,4 @@ #include "MatchBoxPC.h" -#include -#include -#include "isAlreadyMatched.cpp" -#include "findOwnerOfGhost.cpp" -#include "computeCandidateMate.cpp" -#include "initialize.cpp" -#include "parallelComputeCandidateMateB.cpp" -#include "processExposedVertex.cpp" -#include "processMatchedVertices.cpp" -#include "sendBundledMessages.cpp" -#include "processMessages.cpp" -#include "clean.cpp" // *********************************************************************** // diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 6c5543b8..5204894a 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -1,19 +1,10 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" - //TODO comment //TODO use task //TODO destroy the locks -inline void clean(MilanInt myRank, +void clean(MilanInt myRank, MilanLongInt MessageIndex, vector &SRequest, vector &SStatus, diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp index b6d8b3ff..36eaa727 100644 --- a/amgprec/impl/aggregator/computeCandidateMate.cpp +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -9,7 +9,7 @@ * @param edgeLocWeight * @return */ -inline MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, +MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, MilanLongInt *verLocInd, MilanReal *edgeLocWeight) @@ -45,7 +45,7 @@ inline MilanLongInt 
firstComputeCandidateMate(MilanLongInt adj1, * @param Ghost2LocalMap * @return */ -inline MilanLongInt computeCandidateMate(MilanLongInt adj1, +MilanLongInt computeCandidateMate(MilanLongInt adj1, MilanLongInt adj2, MilanReal *edgeLocWeight, MilanLongInt k, diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp index e8a6951c..b5bc1f5f 100644 --- a/amgprec/impl/aggregator/extractUChunk.cpp +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -1,15 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -#define UCHUNK 1000 - -inline void extractUChunk( +void extractUChunk( vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU) diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 10850607..59a87bc3 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -1,13 +1,7 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" ///Find the owner of a ghost node: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, +MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs) { //MilanLongInt Size = mVerDistance.size(); MilanLongInt mStartInd = mVerDistance[myRank]; diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index c5ae3f26..979cdcf5 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -1,16 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" - -#define NUM_THREAD 4 - -inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + +void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *numGhostEdgesPtr, MilanLongInt *numGhostVerticesPtr, diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index 5a9cf476..dbb1052f 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -1,10 +1,4 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" /** * //TODO documentation @@ -17,7 +11,7 @@ * @param Ghost2LocalMap * @return */ -inline bool isAlreadyMatched(MilanLongInt node, +bool isAlreadyMatched(MilanLongInt node, MilanLongInt StartIndex, MilanLongInt EndIndex, vector &GMate, diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp index d3e39056..998edd9e 100644 --- a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -1,14 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, +void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanInt myRank, diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp 
b/amgprec/impl/aggregator/processCrossEdge.cpp index f99bee1a..05cae5d2 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,14 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void PROCESS_CROSS_EDGE(vector &Counter, +void PROCESS_CROSS_EDGE(vector &Counter, map &Ghost2LocalMap, MilanLongInt edge, MilanLongInt *SPtr) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index ff57b5e5..50a5ecfd 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -1,16 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -#include "queueTransfer.cpp" -#include "processCrossEdge.cpp" - -inline void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + +void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt *candidateMate, MilanLongInt *verLocInd, MilanLongInt *verLocPtr, diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index c6c729ad..aaef21a1 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -1,16 +1,8 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -#include "extractUChunk.cpp" //#define privateQueues -inline void processMatchedVertices( +void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, staticQueue &U, @@ -61,6 +53,7 @@ inline void processMatchedVertices( #endif // TODO what would be the optimal UCHUNK + // TODO refactor vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index ae2c8671..cb43cdb8 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,13 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void processMessages(int error_codeC, +void processMessages(int error_codeC, MilanInt numProcs, MilanInt myRank, int ComputeTag, diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index becf14cf..ed2829c6 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -1,13 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include "omp.h" -inline void queuesTransfer(staticQueue &U, +void queuesTransfer(staticQueue &U, staticQueue &privateU, vector &QLocalVtx, vector &QGhostVtx, diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index e16c5669..382d8a16 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -1,13 +1,6 @@ #include "MatchBoxPC.h" -#include -#include -#include -#include -#include "primitiveDataTypeDefinitions.h" -#include "dataStrStaticQueue.h" -#include 
"omp.h" - -inline void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + +void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanInt *BufferSizePtr, MilanLongInt *Buffer, vector &PCumulative, From c7e81935142a3e8d352c5c35d90440efed86e48e Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 12 Jul 2022 12:12:15 -0500 Subject: [PATCH 49/96] omp task in clean.cpp, lock destroy --- amgprec/impl/aggregator/MatchBoxPC.h | 6 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 6 +- amgprec/impl/aggregator/clean.cpp | 133 ++++++++++-------- 3 files changed, 84 insertions(+), 61 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index dfcb6f7e..0b3dcd74 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -234,7 +234,8 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); - void clean(MilanInt myRank, + void clean(MilanLongInt NLVer, + MilanInt myRank, MilanLongInt MessageIndex, vector &SRequest, vector &SStatus, @@ -245,7 +246,8 @@ extern "C" MilanLongInt msgInd, MilanLongInt *msgIndSent, MilanLongInt NumMessagesBundled, - MilanReal *msgPercent); + MilanReal *msgPercent, + omp_lock_t *MateLock); void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, MilanLongInt *verLocPtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 50930601..e45ee792 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -789,7 +789,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif } // End of while (true) - clean(myRank, + clean(NLVer, + myRank, MessageIndex, SRequest, SStatus, @@ -800,7 +801,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( msgInd, msgIndSent, NumMessagesBundled, - msgPercent); + msgPercent, + MateLock); finishTime = MPI_Wtime(); *ph2_time = finishTime - startTime; // Time taken for Phase-2 diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 5204894a..d91076c9 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -1,79 +1,98 @@ #include "MatchBoxPC.h" -//TODO comment -//TODO use task -//TODO destroy the locks +// TODO comment +// TODO use task +// TODO destroy the locks -void clean(MilanInt myRank, - MilanLongInt MessageIndex, - vector &SRequest, - vector &SStatus, - MilanInt BufferSize, - MilanLongInt *Buffer, - MilanLongInt msgActual, - MilanLongInt *msgActualSent, - MilanLongInt msgInd, - MilanLongInt *msgIndSent, - MilanLongInt NumMessagesBundled, - MilanReal *msgPercent) +void clean(MilanLongInt NLVer, + MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent, + omp_lock_t *MateLock) { // Cleanup Phase +#pragma omp parallel + { +#pragma omp master + { +#pragma omp task + { + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Waitall= " << endl; - fflush(stdout); + cout << "\n(" << myRank << ") Waitall= " << endl; + fflush(stdout); #endif #ifdef DEBUG_HANG_ - cout << "\n(" << myRank << ") Waitall " << endl; - fflush(stdout); + 
cout << "\n(" << myRank << ") Waitall " << endl; + fflush(stdout); #endif - return; + return; - MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); + MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); - // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer - if (BufferSize > 0) - { - MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer - free(Buffer); // Free the memory that was allocated - } + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated + } + } #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")End of function to compute matching: " << endl; - fflush(stdout); - cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; - fflush(stdout); - cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; - fflush(stdout); - cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")End of function to compute matching: " << endl; + fflush(stdout); + cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; + fflush(stdout); + cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; + fflush(stdout); + cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; + fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; - cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; - cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; - if (msgInd > 0) - { - cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; - } - fflush(stdout); + cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; + cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; + if (msgInd > 0) + { + cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; + } + fflush(stdout); #endif - *msgActualSent = msgActual; - *msgIndSent = msgInd; - if (msgInd > 0) - { - *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; - } - else - { - *msgPercent = 0; - } +#pragma omp task + { + *msgActualSent = msgActual; + *msgIndSent = msgInd; + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { + *msgPercent = 0; + } + } + // Destroy the locks +#pragma omp taskloop num_tasks(NUM_THREAD) + for (int i = 0; i < NLVer; i++) + omp_destroy_lock(&MateLock[i]); #ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") Done" << endl; - fflush(stdout); + if (myRank == 0) + cout << "\n(" << myRank << ") Done" << endl; + fflush(stdout); #endif + } + } } \ No newline at end of file From ccac816f52b4be91a0231c14c3a4983c85285e04 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Tue, 12 Jul 2022 13:24:12 -0500 Subject: [PATCH 50/96] processCrossEdge small refactoring --- amgprec/impl/aggregator/processCrossEdge.cpp | 11 +++++------ 1 file 
changed, 5 insertions(+), 6 deletions(-) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 05cae5d2..1ef99560 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,11 +1,10 @@ #include "MatchBoxPC.h" void PROCESS_CROSS_EDGE(vector &Counter, - map &Ghost2LocalMap, - MilanLongInt edge, - MilanLongInt *SPtr) + map &Ghost2LocalMap, + MilanLongInt edge, + MilanLongInt *SPtr) { - MilanLongInt S = *SPtr; // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B if (Counter[Ghost2LocalMap[edge]] > 0) @@ -13,13 +12,13 @@ void PROCESS_CROSS_EDGE(vector &Counter, Counter[Ghost2LocalMap[edge]] -= 1; // Decrement if (Counter[Ghost2LocalMap[edge]] == 0) { - S--; // Decrement S + (*SPtr)--; // Decrement S #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; fflush(stdout); #endif } + } // End of if Counter[edge] > 0 // End: PARALLEL_PROCESS_CROSS_EDGE_B - *SPtr = S; } \ No newline at end of file From dc1675766fe1fc8d75bf1c264176178fa1d2ae85 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 13 Jul 2022 16:19:38 -0500 Subject: [PATCH 51/96] processMessages.cpp further refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 43 ++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 246 ++++-------------- .../impl/aggregator/computeCandidateMate.cpp | 24 +- amgprec/impl/aggregator/processMessages.cpp | 224 +++++++++++++++- 4 files changed, 304 insertions(+), 233 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 0b3dcd74..eecc57ce 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -345,17 +345,38 @@ extern "C" vector &SRequest, vector &SStatus); - void processMessages(int error_codeC, - MilanInt numProcs, - MilanInt myRank, - int ComputeTag, - int BundleTag, - MPI_Comm comm, - vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, - MilanLongInt *BundleSizePtr); + void processMessages( + MilanLongInt NLVer, + MilanLongInt *Mate, + MilanLongInt *candidateMate, + map &Ghost2LocalMap, + vector &GMate, + vector &Counter, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *msgActualPtr, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *verLocPtr, + MilanLongInt k, + MilanLongInt *verLocInd, + int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt u, + MilanLongInt v, + MilanLongInt message_type, + MilanLongInt *SPtr, + staticQueue &U); void extractUChunk( vector &UChunkBeingProcessed, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index e45ee792..c5a81c4d 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -598,7 +598,23 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MESSAGES ////////////////////////////////////// 
/////////////////////////////////////////////////////////////////////////////////// - processMessages(error_codeC, + processMessages(NLVer, + Mate, + candidateMate, + Ghost2LocalMap, + GMate, + Counter, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &msgActual, + edgeLocWeight, + verDistance, + verLocPtr, + k, + verLocInd, + error_codeC, numProcs, myRank, ComputeTag, @@ -608,205 +624,41 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( error_message, message_length, ReceiveBuffer, - &bundleSize); - - bundleCounter = 0; - while (bundleCounter < bundleSize) - { - u = ReceiveBuffer[bundleCounter]; // GHOST - bundleCounter++; - v = ReceiveBuffer[bundleCounter]; // LOCAL - bundleCounter++; - message_type = ReceiveBuffer[bundleCounter]; // TYPE - bundleCounter++; - - // CASE I: REQUEST - if (message_type == REQUEST) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is REQUEST" << endl; - fflush(stdout); -#endif -#ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } + u, + v, + message_type, + &S, + U); -#endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched (v is local) - candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost - if (candidateMate[v - StartIndex] == u) - { - GMate[Ghost2LocalMap[u]] = v; // u is ghost - Mate[v - StartIndex] = u; // v is local - // Q.push_back(u); - U.push_back(v); - U.push_back(u); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; - fflush(stdout); -#endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); - } // End of if ( candidateMate[v-StartIndex] == u )e - } // End of if ( Mate[v] == -1 ) - } // End of REQUEST - else - { // CASE II: SUCCESS - if (message_type == SUCCESS) - { -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; - fflush(stdout); -#endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); -#ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } -#endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched ( v is local) - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); - candidateMate[v - StartIndex] = w; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); -#endif - ghostOwner = 
findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - GMate[Ghost2LocalMap[w]] = v; // w is ghost - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); -#endif - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); -#endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - { // No dominant edge found - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) + ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of if ( candidateMate[v-StartIndex] == u ) - } // End of if ( Mate[v] == -1 ) - } // End of if ( message_type == SUCCESS ) - else - { // CASE III: FAILURE -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is FAILURE" << endl; - fflush(stdout); -#endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); - } // End of else: CASE III - } // End of else: CASE I - } // End of if (!MsgQ.empty()) - ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; + fflush(stdout); #endif - } // End of while (true) - - clean(NLVer, - myRank, - MessageIndex, - SRequest, - SStatus, - BufferSize, - Buffer, - msgActual, - msgActualSent, - msgInd, - msgIndSent, - 
NumMessagesBundled, - msgPercent, - MateLock); - - finishTime = MPI_Wtime(); - *ph2_time = finishTime - startTime; // Time taken for Phase-2 - *ph2_card = myCard; // Cardinality at the end of Phase-2 +} // End of while (true) + +clean(NLVer, + myRank, + MessageIndex, + SRequest, + SStatus, + BufferSize, + Buffer, + msgActual, + msgActualSent, + msgInd, + msgIndSent, + NumMessagesBundled, + msgPercent, + MateLock); + +finishTime = MPI_Wtime(); +*ph2_time = finishTime - startTime; // Time taken for Phase-2 +*ph2_card = myCard; // Cardinality at the end of Phase-2 } // End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate #endif diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp index 36eaa727..f79fc613 100644 --- a/amgprec/impl/aggregator/computeCandidateMate.cpp +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -10,9 +10,9 @@ * @return */ MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanLongInt *verLocInd, - MilanReal *edgeLocWeight) + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight) { MilanInt w = -1; MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN @@ -46,15 +46,15 @@ MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, * @return */ MilanLongInt computeCandidateMate(MilanLongInt adj1, - MilanLongInt adj2, - MilanReal *edgeLocWeight, - MilanLongInt k, - MilanLongInt *verLocInd, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap) + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) { // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index cb43cdb8..7d44b281 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,21 +1,45 @@ #include "MatchBoxPC.h" -void processMessages(int error_codeC, - MilanInt numProcs, - MilanInt myRank, - int ComputeTag, - int BundleTag, - MPI_Comm comm, - vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, - MilanLongInt *BundleSizePtr) +void processMessages( + MilanLongInt NLVer, + MilanLongInt *Mate, + MilanLongInt *candidateMate, + map &Ghost2LocalMap, + vector &GMate, + vector &Counter, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *msgActualPtr, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *verLocPtr, + MilanLongInt k, + MilanLongInt *verLocInd, + int error_codeC, + MilanInt numProcs, + MilanInt myRank, + int ComputeTag, + int BundleTag, + MPI_Comm comm, + vector &Message, + char *error_message, + int message_length, + vector &ReceiveBuffer, + MilanLongInt u, + MilanLongInt v, + MilanLongInt message_type, + MilanLongInt *SPtr, + staticQueue &U) { MilanInt Sender; MPI_Status computeStatus; - MilanLongInt bundleSize = *BundleSizePtr; + MilanLongInt bundleSize, bundleCounter = 0, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; + MilanLongInt S = *SPtr; // TODO refactor this + MilanLongInt adj11, adj12, k1; + MilanLongInt ghostOwner; #ifdef PRINT_DEBUG_INFO_ cout @@ -118,6 +142,180 @@ void processMessages(int error_codeC, fflush(stdout); #endif - 
*BundleSizePtr = bundleSize; + bundleCounter = 0; + while (bundleCounter < bundleSize) + { + u = ReceiveBuffer[bundleCounter]; // GHOST + bundleCounter++; + v = ReceiveBuffer[bundleCounter]; // LOCAL + bundleCounter++; + message_type = ReceiveBuffer[bundleCounter]; // TYPE + bundleCounter++; + + // CASE I: REQUEST + if (message_type == REQUEST) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is REQUEST" << endl; + fflush(stdout); +#endif +#ifdef DEBUG_GHOST_ + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) + { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } + +#endif + + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) + { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + // Q.push_back(u); + U.push_back(v); + U.push_back(u); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; + fflush(stdout); +#endif + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else + { // CASE II: SUCCESS + if (message_type == SUCCESS) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); +#endif + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); +#ifdef DEBUG_GHOST_ + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) + { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } +#endif + if (Mate[v - StartIndex] == -1) + { // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); + candidateMate[v - StartIndex] = w; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + msgInd++; + msgActual++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); +#endif + // 
Decrement the counter: + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + msgInd++; + msgActual++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of if ( candidateMate[v-StartIndex] == u ) + } // End of if ( Mate[v] == -1 ) + } // End of if ( message_type == SUCCESS ) + else + { // CASE III: FAILURE +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is FAILURE" << endl; + fflush(stdout); +#endif + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I + } + + *myCardPtr = myCard; + *SPtr = S; + *msgIndPtr = msgInd; + *msgActualPtr = msgActual; return; } \ No newline at end of file From 47c6f4f2f8787ba7b70d7994b853b0799dddfacf Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 13 Jul 2022 16:19:52 -0500 Subject: [PATCH 52/96] comments --- ...dgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 9 +++++---- amgprec/impl/aggregator/processMessages.cpp | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index c5a81c4d..2ebb2876 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -121,14 +121,14 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - // inputSubGraph.getStartEndIndices(StartIndex, EndIndex); MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank - // MilanLongInt EndIndex = verDistance[myRank+1]; //The ending vertex owned by the current rank MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; const int ComputeTag = 7; // Predefined tag - const int BundleTag = 9; // 
Predefined tag + const int BundleTag = 9; // Predefined tag //TODO refactor this + + //TODO refactor this int error_codeC; error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); char error_message[MPI_MAX_ERROR_STRING]; @@ -144,7 +144,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( // Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] Message.resize(3, -1); - MilanLongInt message_type = 0; + MilanLongInt message_type = 0; //TODO refactor this, it could be constants // Data structures for Message Bundling: // Although up to two messages can be sent along any cross edge, // only one message will be sent in the initialization phase - @@ -158,6 +158,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( PCounter[i] = 0; MilanLongInt NumMessagesBundled = 0; + //TODO when the last computational section will be refactored this could be eliminated MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! MilanLongInt *candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 7d44b281..2ae84317 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,5 +1,7 @@ #include "MatchBoxPC.h" +//TODO there are many useless parameter passed to this function + void processMessages( MilanLongInt NLVer, MilanLongInt *Mate, From 1ea1be33badee014e6208a3bbb0e430f1d1ef054 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 14 Jul 2022 15:23:32 -0500 Subject: [PATCH 53/96] Refactoring, eliminated useless passed variables --- amgprec/impl/aggregator/MatchBoxPC.h | 13 ++---- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 41 +++---------------- amgprec/impl/aggregator/processMessages.cpp | 29 +++++++++---- .../impl/aggregator/sendBundledMessages.cpp | 3 -- 4 files changed, 29 insertions(+), 57 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index eecc57ce..fc141b43 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -73,6 +73,9 @@ const MilanLongInt SUCCESS = 2; const MilanLongInt FAILURE = 3; const MilanLongInt SIZEINFO = 4; +const int ComputeTag = 7; // Predefined tag +const int BundleTag = 9; // Predefined tag + // MPI type map template MPI_Datatype TypeMap(); @@ -335,8 +338,6 @@ extern "C" MilanLongInt *MessageIndexPtr, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &QLocalVtx, vector &QGhostVtx, @@ -362,19 +363,13 @@ extern "C" MilanLongInt *verLocPtr, MilanLongInt k, MilanLongInt *verLocInd, - int error_codeC, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, + MilanLongInt numGhostEdges, MilanLongInt u, MilanLongInt v, - MilanLongInt message_type, MilanLongInt *SPtr, staticQueue &U); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 2ebb2876..45c49ec0 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -125,26 +125,16 @@ void 
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; - const int ComputeTag = 7; // Predefined tag - const int BundleTag = 9; // Predefined tag //TODO refactor this - //TODO refactor this - int error_codeC; - error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); - char error_message[MPI_MAX_ERROR_STRING]; - int message_length; - - // MilanLongInt NLVer=0, NLEdge=0, StartIndex=0, EndIndex=0; MilanLongInt msgActual = 0, msgInd = 0; MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight MilanReal startTime, finishTime; - // MilanReal Precision = MPI_Wtick(); //Get the precision of the MPI Timer + startTime = MPI_Wtime(); // Data structures for sending and receiving messages: vector Message; // [ u, v, message_type ] Message.resize(3, -1); - MilanLongInt message_type = 0; //TODO refactor this, it could be constants // Data structures for Message Bundling: // Although up to two messages can be sent along any cross edge, // only one message will be sent in the initialization phase - @@ -204,7 +194,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt *Buffer; // Declare the locks - // TODO destroy the locks omp_lock_t MateLock[NLVer]; initialize(NLVer, NLEdge, StartIndex, @@ -341,8 +330,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &MessageIndex, numProcs, myRank, - ComputeTag, - BundleTag, + //ComputeTag, + //BundleTag, comm, QLocalVtx, QGhostVtx, @@ -376,21 +365,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); fflush(stdout); #endif - // Buffer to receive bundled messages - // Maximum messages that can be received from any processor is - // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) - vector ReceiveBuffer; - MilanLongInt bundleSize = 0, bundleCounter = 0; - try - { - ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } + while (true) { #ifdef DEBUG_HANG_ @@ -615,19 +590,13 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( verLocPtr, k, verLocInd, - error_codeC, numProcs, myRank, - ComputeTag, - BundleTag, comm, Message, - error_message, - message_length, - ReceiveBuffer, + numGhostEdges, u, v, - message_type, &S, U); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 2ae84317..c487042a 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -1,7 +1,5 @@ #include "MatchBoxPC.h" -//TODO there are many useless parameter passed to this function - void processMessages( MilanLongInt NLVer, MilanLongInt *Mate, @@ -19,19 +17,13 @@ void processMessages( MilanLongInt *verLocPtr, MilanLongInt k, MilanLongInt *verLocInd, - int error_codeC, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &Message, - char *error_message, - int message_length, - vector &ReceiveBuffer, + MilanLongInt numGhostEdges, MilanLongInt u, MilanLongInt v, - MilanLongInt message_type, MilanLongInt *SPtr, staticQueue &U) { @@ -42,6 +34,25 @@ void processMessages( MilanLongInt S = *SPtr; // TODO refactor this MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; + int error_codeC; + 
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); + char error_message[MPI_MAX_ERROR_STRING]; + int message_length; + MilanLongInt message_type = 0; + // Buffer to receive bundled messages + // Maximum messages that can be received from any processor is + // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) + vector ReceiveBuffer; + try + { + ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } #ifdef PRINT_DEBUG_INFO_ cout diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index 382d8a16..f3dd3e46 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -12,8 +12,6 @@ void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanLongInt *MessageIndexPtr, MilanInt numProcs, MilanInt myRank, - int ComputeTag, - int BundleTag, MPI_Comm comm, vector &QLocalVtx, vector &QGhostVtx, @@ -24,7 +22,6 @@ void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, { MilanLongInt myIndex = 0, msgActual = *msgActualPtr, MessageIndex = *MessageIndexPtr, numGhostEdges = *numGhostEdgesPtr, numMessagesToSend; - const MilanLongInt SIZEINFO = 4; MilanInt i = 0, OneMessageSize = 0, BufferSize = *BufferSizePtr; #ifdef DEBUG_HANG_ From f17082b3375d95bcfe75df2a32b215ad09581f4c Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 14 Jul 2022 15:27:53 -0500 Subject: [PATCH 54/96] Refactoring: eliminatino of SPtr inside processMessages --- amgprec/impl/aggregator/processMessages.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c487042a..7ce867ff 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -24,14 +24,13 @@ void processMessages( MilanLongInt numGhostEdges, MilanLongInt u, MilanLongInt v, - MilanLongInt *SPtr, + MilanLongInt *S, staticQueue &U) { MilanInt Sender; MPI_Status computeStatus; MilanLongInt bundleSize, bundleCounter = 0, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; - MilanLongInt S = *SPtr; // TODO refactor this MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; int error_codeC; @@ -39,6 +38,7 @@ void processMessages( char error_message[MPI_MAX_ERROR_STRING]; int message_length; MilanLongInt message_type = 0; + // Buffer to receive bundled messages // Maximum messages that can be received from any processor is // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) @@ -61,7 +61,7 @@ void processMessages( fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")About to begin Message processing phase ... S=" << S << endl; + cout << "\n(" << myRank << ")About to begin Message processing phase ... 
*S=" << *S << endl; fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ @@ -196,7 +196,7 @@ void processMessages( cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -209,7 +209,7 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -262,7 +262,7 @@ void processMessages( fflush(stdout); #endif // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -321,13 +321,12 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, &S); // Decrease the counter + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); // Decrease the counter } // End of else: CASE III } // End of else: CASE I } *myCardPtr = myCard; - *SPtr = S; *msgIndPtr = msgInd; *msgActualPtr = msgActual; return; From 5ca78fb871c963a93578af05e2b330a08a964334 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 14 Jul 2022 17:10:36 -0500 Subject: [PATCH 55/96] Refactoring isAlreadyMatched and processCrossEdge --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 4 +-- amgprec/impl/aggregator/isAlreadyMatched.cpp | 6 ++-- amgprec/impl/aggregator/processCrossEdge.cpp | 7 ++-- .../impl/aggregator/processExposedVertex.cpp | 2 +- .../aggregator/processMatchedVertices.cpp | 2 +- amgprec/impl/aggregator/processMessages.cpp | 36 +++++++++---------- 7 files changed, 25 insertions(+), 33 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index fc141b43..a8f22f49 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -290,7 +290,6 @@ extern "C" staticQueue &privateQOwner); void PROCESS_CROSS_EDGE(vector &Counter, - map &Ghost2LocalMap, MilanLongInt edge, MilanLongInt *SPtr); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 45c49ec0..6e24393b 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -142,7 +142,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector QLocalVtx, QGhostVtx, QMsgType; vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! 
- // TODO move this inseide the initialization function MilanLongInt *PCounter = new MilanLongInt[numProcs]; for (int i = 0; i < numProcs; i++) PCounter[i] = 0; @@ -358,7 +357,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Entering While(true) loop.."; fflush(stdout); - // U.display(); fflush(stdout); #endif #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -468,7 +466,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index dbb1052f..38ae73f5 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -23,11 +23,9 @@ bool isAlreadyMatched(MilanLongInt node, #pragma omp critical(Mate) { if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? - if (GMate[Ghost2LocalMap[node]] >= 0)// Already matched - result = true; + result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched } else { //A local vertex - if (Mate[node - StartIndex] >= 0) // Already matched - result = true; + result = (Mate[node - StartIndex] >= 0); // Already matched } } diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 1ef99560..ee367a61 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,16 +1,15 @@ #include "MatchBoxPC.h" void PROCESS_CROSS_EDGE(vector &Counter, - map &Ghost2LocalMap, MilanLongInt edge, MilanLongInt *SPtr) { // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B - if (Counter[Ghost2LocalMap[edge]] > 0) + if (Counter[edge] > 0) { - Counter[Ghost2LocalMap[edge]] -= 1; // Decrement - if (Counter[Ghost2LocalMap[edge]] == 0) + Counter[edge] -= 1; // Decrement + if (Counter[edge] == 0) { (*SPtr)--; // Decrement S #ifdef PRINT_DEBUG_INFO_ diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 50a5ecfd..97840b19 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -115,7 +115,7 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, //TODO refactor this!! 
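// After this refactor PROCESS_CROSS_EDGE no longer receives Ghost2LocalMap:
// the caller resolves the ghost vertex to its local slot first, so the two
// calls below are equivalent (w is a ghost vertex, S the phase counter):
//
//   PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S);    // old signature
//   PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S);    // new signature
//
// Inside the function, Counter[edge] is decremented and, once it drops to
// zero, S is decremented as well.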
// Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index aaef21a1..c1ae6d13 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -174,7 +174,7 @@ void processMatchedVertices( // TODO refactor this // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 7ce867ff..7e5c3915 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -30,7 +30,7 @@ void processMessages( MilanInt Sender; MPI_Status computeStatus; - MilanLongInt bundleSize, bundleCounter = 0, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; + MilanLongInt bundleSize, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; int error_codeC; @@ -155,15 +155,13 @@ void processMessages( fflush(stdout); #endif - bundleCounter = 0; - while (bundleCounter < bundleSize) + + //Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop + for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { - u = ReceiveBuffer[bundleCounter]; // GHOST - bundleCounter++; - v = ReceiveBuffer[bundleCounter]; // LOCAL - bundleCounter++; - message_type = ReceiveBuffer[bundleCounter]; // TYPE - bundleCounter++; + u = ReceiveBuffer[bundleCounter - 3]; // GHOST + v = ReceiveBuffer[bundleCounter - 2]; // LOCAL + message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE // CASE I: REQUEST if (message_type == REQUEST) @@ -188,7 +186,6 @@ void processMessages( { GMate[Ghost2LocalMap[u]] = v; // u is ghost Mate[v - StartIndex] = u; // v is local - // Q.push_back(u); U.push_back(v); U.push_back(u); myCard++; @@ -196,7 +193,8 @@ void processMessages( cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); + + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -208,8 +206,8 @@ void processMessages( cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -261,8 +259,8 @@ void processMessages( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, w, S); + + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -320,10 +318,10 @@ void processMessages( cout << "\n(" << myRank 
<< ")Message type is FAILURE" << endl; fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap, u, S); // Decrease the counter - } // End of else: CASE III - } // End of else: CASE I + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I } *myCardPtr = myCard; From 561cadee0fd721d475f625c3441ead31790c2ede Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Fri, 15 Jul 2022 07:27:30 -0500 Subject: [PATCH 56/96] parallelQueues working --- amgprec/impl/aggregator/MatchBoxPC.h | 2 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 63 +++++++++---------- .../aggregator/processMatchedVertices.cpp | 63 +++++++------------ amgprec/impl/aggregator/queueTransfer.cpp | 1 + 4 files changed, 54 insertions(+), 75 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index a8f22f49..d1e26fbc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,7 +66,7 @@ using namespace std; #define NUM_THREAD 4 -#define UCHUNK 1000 +#define UCHUNK 100000 const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 6e24393b..d5ac4394 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -71,8 +71,6 @@ Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) */ -#define UCHUNK 1000 - #ifdef SERIAL_MPI #else @@ -121,7 +119,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank + MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; @@ -147,7 +145,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( PCounter[i] = 0; MilanLongInt NumMessagesBundled = 0; - //TODO when the last computational section will be refactored this could be eliminated + // TODO when the last computational section will be refactored this could be eliminated MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! 
MilanLongInt *candidateMate = nullptr; #ifdef PRINT_DEBUG_INFO_ @@ -282,6 +280,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); + processMatchedVertices(NLVer, UChunkBeingProcessed, U, @@ -329,8 +328,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &MessageIndex, numProcs, myRank, - //ComputeTag, - //BundleTag, comm, QLocalVtx, QGhostVtx, @@ -598,35 +595,35 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &S, U); - ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// + ///////////////////////// END OF PROCESS MESSAGES ///////////////////////////////// #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; - fflush(stdout); - cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual; + fflush(stdout); + cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl; + fflush(stdout); #endif -} // End of while (true) - -clean(NLVer, - myRank, - MessageIndex, - SRequest, - SStatus, - BufferSize, - Buffer, - msgActual, - msgActualSent, - msgInd, - msgIndSent, - NumMessagesBundled, - msgPercent, - MateLock); - -finishTime = MPI_Wtime(); -*ph2_time = finishTime - startTime; // Time taken for Phase-2 -*ph2_card = myCard; // Cardinality at the end of Phase-2 + } // End of while (true) + + clean(NLVer, + myRank, + MessageIndex, + SRequest, + SStatus, + BufferSize, + Buffer, + msgActual, + msgActualSent, + msgInd, + msgIndSent, + NumMessagesBundled, + msgPercent, + MateLock); + + finishTime = MPI_Wtime(); + *ph2_time = finishTime - startTime; // Time taken for Phase-2 + *ph2_card = myCard; // Cardinality at the end of Phase-2 } // End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate #endif diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index c1ae6d13..d766bc42 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -1,6 +1,6 @@ #include "MatchBoxPC.h" -//#define privateQueues +//#define error void processMatchedVertices( MilanLongInt NLVer, @@ -38,8 +38,13 @@ void processMatchedVertices( MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; - // TODO check if private queues arrive empty -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4) + // TODO check that the queues arrives empty + assert(privateQGhostVtx.empty()); + assert(privateQLocalVtx.empty()); + assert(privateQMsgType.empty()); + assert(privateQOwner.empty()); + +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { #ifdef PRINT_DEBUG_INFO_ @@ -140,25 +145,18 @@ void 
processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif + msgInd++; + NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#ifdef privateQueues privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); privateQOwner.push_back(ghostOwner); -#endif -#ifndef privateQueues - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(REQUEST); - QOwner.push_back(ghostOwner); -#endif - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { Mate[v - StartIndex] = w; // v is a local vertex @@ -214,28 +212,18 @@ void processMatchedVertices( cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - - // ghostOwner = inputSubGraph.findOwner(w); + msgInd++; + NumMessagesBundled++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#ifdef privateQueues privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); privateQMsgType.push_back(FAILURE); privateQOwner.push_back(ghostOwner); -#endif -#ifndef privateQueues - QLocalVtx.push_back(v); - QGhostVtx.push_back(w); - QMsgType.push_back(FAILURE); - QOwner.push_back(ghostOwner); -#endif - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; } // End of if(GHOST) } // End of for loop } // End of Else: w == -1 @@ -270,26 +258,18 @@ void processMatchedVertices( fflush(stdout); #endif + msgInd++; + NumMessagesBundled++; ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#ifdef privateQueues privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); privateQMsgType.push_back(SUCCESS); privateQOwner.push_back(ghostOwner); -#endif -#ifndef privateQueues - QLocalVtx.push_back(u); - QGhostVtx.push_back(v); - QMsgType.push_back(SUCCESS); - QOwner.push_back(ghostOwner); -#endif - PCounter[ghostOwner]++; - NumMessagesBundled++; - msgInd++; } // End of If( v != Mate[u] ) // omp_unset_lock(&MateLock[u - StartIndex]); @@ -306,14 +286,15 @@ void processMatchedVertices( if (privateU.size() < UCHUNK && !U.empty()) continue; -#ifdef privateQueues + printf("Executed \n"); +#ifdef error #pragma omp critical(U) { while (!privateU.empty()) U.push_back(privateU.pop_back()); } #endif -#ifndef privateQueues +#ifndef error queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, @@ -322,7 +303,7 @@ void processMatchedVertices( privateQOwner); #endif } - } // End of while ( /*!Q.empty()*/ !U.empty() ) + } // End of while ( !U.empty() ) queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index ed2829c6..cbae1fc2 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -12,6 +12,7 @@ void queuesTransfer(staticQueue &U, staticQueue &privateQOwner) { + #pragma omp critical(U) { while (!privateU.empty()) From a9bb6b26fad62f7e95438b0495810d02c7313f23 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 16 Jul 2022 11:20:39 -0500 Subject: [PATCH 57/96] processMatchedVertices partially working mixed critical and 
lock version --- amgprec/impl/aggregator/MatchBoxPC.h | 5 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 4 +- .../aggregator/processMatchedVertices.cpp | 288 +++++++++--------- 3 files changed, 151 insertions(+), 146 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index d1e26fbc..58053c18 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,7 +66,7 @@ using namespace std; #define NUM_THREAD 4 -#define UCHUNK 100000 +#define UCHUNK 1000 const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; @@ -323,7 +323,8 @@ extern "C" staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner); + staticQueue &privateQOwner, + omp_lock_t *MateLock); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanInt *BufferSizePtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d5ac4394..ffe27f68 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -281,6 +281,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector UChunkBeingProcessed; UChunkBeingProcessed.reserve(UCHUNK); +//#define PRINT_DEBUG_INFO_ processMatchedVertices(NLVer, UChunkBeingProcessed, U, @@ -310,7 +311,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( privateQLocalVtx, privateQGhostVtx, privateQMsgType, - privateQOwner); + privateQOwner, + MateLock); ///////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////// SEND BUNDLED MESSAGES ///////////////////////////////////// diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index d766bc42..d92f1a57 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -32,30 +32,24 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner) + staticQueue &privateQOwner, + omp_lock_t *MateLock) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; - // TODO check that the queues arrives empty - assert(privateQGhostVtx.empty()); - assert(privateQLocalVtx.empty()); - assert(privateQMsgType.empty()); - assert(privateQOwner.empty()); - -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) - { - #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << "=========================************===============================" << endl; - fflush(stdout); - fflush(stdout); + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); #endif #ifdef COUNT_LOCAL_VERTEX - MilanLongInt localVertices = 0; + MilanLongInt localVertices = 0; #endif +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) 
firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) + { // TODO what would be the optimal UCHUNK // TODO refactor @@ -89,23 +83,29 @@ void processMatchedVertices( if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: -#pragma omp critical(innerProcessMatched) - { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap) and - candidateMate[v - StartIndex] == u) + // If the current vertex is pointing to a matched vertex and is not matched + // FIXME is there a way to make candidateMate private? + // for the moment it could generate an error. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { + + bool seh = false; +#pragma omp critical(prova) { + seh = candidateMate[v - StartIndex] != u; + } + if (seh) + continue; +#pragma omp critical(prova) + { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, 0, @@ -117,48 +117,44 @@ void processMatchedVertices( Ghost2LocalMap); candidateMate[v - StartIndex] = w; + } - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - - // TODO is it possible to lock without a critical region? 
- // TODO there must be a more elegant and efficient way to do this - /* - while(true) { - if (omp_test_lock(&MateLock[v - StartIndex])) { - if (omp_test_lock(&MateLock[w - StartIndex])) break; - else omp_unset_lock(&MateLock[v - StartIndex]); - } - } - */ + // If found a dominating edge: + if (w >= 0) + { - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + msgInd++; +#pragma omp atomic + NumMessagesBundled++; - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); +#pragma omp critical(prova) + { if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + while (!omp_test_lock(&MateLock[v - StartIndex])) + ; Mate[v - StartIndex] = w; // v is a local vertex GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex // Q.push_back(u); @@ -170,16 +166,23 @@ void processMatchedVertices( fflush(stdout); #endif - // TODO refactor this // Decrement the counter: PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); - + omp_unset_lock(&MateLock[v - StartIndex]); } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex + } + } // End of if a Ghost Vertex + else + { // w is a local vertex +#pragma omp critical(prova) + { if (candidateMate[w - StartIndex] == v) { + while (!omp_test_lock(&MateLock[v - StartIndex])) + ; + while (!omp_test_lock(&MateLock[w - StartIndex])) + ; + Mate[v - StartIndex] = w; // v is a local vertex Mate[w - StartIndex] = v; // w is a local vertex // Q.push_back(u); @@ -190,121 +193,120 @@ void processMatchedVertices( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif + omp_unset_lock(&MateLock[v - StartIndex]); + omp_unset_lock(&MateLock[w - StartIndex]); } // End of if(CandidateMate(w) = v - } // End of Else - - // omp_unset_lock(&MateLock[v - StartIndex]); - // omp_unset_lock(&MateLock[w - StartIndex]); + } + } // End of Else - } // End of if(w >=0) - else + } // End of if(w >=0) + else + { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, 
myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + msgInd++; +#pragma omp atomic + NumMessagesBundled++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); - } // End of If (candidateMate[v-StartIndex] == u + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of critical region if + } // End of If (candidateMate[v-StartIndex] == u } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor is a ghost vertex -#pragma omp critical(innerProcessMatched) + while (!omp_test_lock(&MateLock[u - StartIndex])) + ; +#pragma omp critical(prova) { - - // while(!omp_test_lock(&MateLock[u - StartIndex])); - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is local - // Build the Message Packet: - // Message[0] = u; //LOCAL - // Message[1] = v; //GHOST - // Message[2] = SUCCESS; //TYPE - // Send a Request (Asynchronous) + } + if (v != Mate[u - StartIndex]) + { // u is local #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - msgInd++; - NumMessagesBundled++; - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - } // End of If( v != Mate[u] ) + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + msgInd++; +#pragma omp atomic + NumMessagesBundled++; - // omp_unset_lock(&MateLock[u - StartIndex]); + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); - } // End of critical region - } // End of Else //A Ghost Vertex + } // End of If( v != Mate[u] ) - } // End of For Loop adj(u) + omp_unset_lock(&MateLock[u - StartIndex]); - } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex + } // End of Else //A Ghost 
Vertex - // Ask for the critical section only when a certain amount - // of data have been accumulated in the private queue - if (privateU.size() < UCHUNK && !U.empty()) - continue; + } // End of for - printf("Executed \n"); -#ifdef error + // TODO commenting that part of code might generate errors + // Ask for the critical section only when there are no more data to + // compute. + if (/*privateU.size() < UCHUNK &&*/ !U.empty()) + continue; #pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } -#endif + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } + #ifndef error - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); +#pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); + } + } #endif + } } } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, @@ -329,4 +331,4 @@ void processMatchedVertices( *msgIndPtr = msgInd; *NumMessagesBundledPtr = NumMessagesBundled; *SPtr = S; -} \ No newline at end of file +} From 1374f21ba807ed0483050477726b1452a3037302 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 16 Jul 2022 13:54:40 -0500 Subject: [PATCH 58/96] refactor increment on variables passed by reference in processMatchedVertices.cpp --- .../aggregator/processMatchedVertices.cpp | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index d92f1a57..3816080c 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -37,7 +37,6 @@ void processMatchedVertices( { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; - MilanLongInt myCard = *myCardPtr, msgInd = *msgIndPtr, NumMessagesBundled = *NumMessagesBundledPtr, S = *SPtr, privateMyCard = 0; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -48,7 +47,7 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateMyCard, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { // TODO what would be the optimal UCHUNK @@ -140,9 +139,9 @@ void processMatchedVertices( #pragma omp atomic PCounter[ghostOwner]++; #pragma omp atomic - msgInd++; + (*msgIndPtr)++; #pragma omp atomic - NumMessagesBundled++; + (*NumMessagesBundledPtr)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -160,14 +159,15 @@ void processMatchedVertices( // Q.push_back(u); privateU.push_back(v); privateU.push_back(w); - privateMyCard++; +#pragma omp atomic + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; 
fflush(stdout); #endif // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); omp_unset_lock(&MateLock[v - StartIndex]); } // End of if CandidateMate[w] = v } @@ -188,7 +188,8 @@ void processMatchedVertices( // Q.push_back(u); privateU.push_back(v); privateU.push_back(w); - privateMyCard++; +#pragma omp atomic + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -222,9 +223,9 @@ void processMatchedVertices( #pragma omp atomic PCounter[ghostOwner]++; #pragma omp atomic - msgInd++; + (*msgIndPtr)++; #pragma omp atomic - NumMessagesBundled++; + (*NumMessagesBundledPtr)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -264,10 +265,9 @@ void processMatchedVertices( #pragma omp atomic PCounter[ghostOwner]++; #pragma omp atomic - msgInd++; + (*msgIndPtr)++; #pragma omp atomic - NumMessagesBundled++; - + (*NumMessagesBundledPtr)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); privateQMsgType.push_back(SUCCESS); @@ -314,11 +314,6 @@ void processMatchedVertices( privateQMsgType, privateQOwner); -// TODO it is possible that this is not working as expected -// further investigation needed. -#pragma omp atomic - myCard += privateMyCard; - #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, @@ -327,8 +322,4 @@ void processMatchedVertices( #endif } - *myCardPtr = myCard; - *msgIndPtr = msgInd; - *NumMessagesBundledPtr = NumMessagesBundled; - *SPtr = S; } From 71d4cdc3191ab782a4f90ae25e18eca187f1de2f Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 06:11:11 -0500 Subject: [PATCH 59/96] processMatchedVertices rollback to critical regions --- amgprec/impl/aggregator/MatchBoxPC.h | 41 +- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 6 +- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 2 + .../aggregator/processMatchedVertices.cpp | 400 ++++++++---------- 4 files changed, 207 insertions(+), 242 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 58053c18..fba63883 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -66,7 +66,7 @@ using namespace std; #define NUM_THREAD 4 -#define UCHUNK 1000 +#define UCHUNK 5 const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; @@ -295,7 +295,6 @@ extern "C" void processMatchedVertices( MilanLongInt NLVer, - vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, @@ -326,25 +325,25 @@ extern "C" staticQueue &privateQOwner, omp_lock_t *MateLock); - void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, - MilanInt *BufferSizePtr, - MilanLongInt *Buffer, - vector &PCumulative, - vector &PMessageBundle, - vector &PSizeInfoMessages, - MilanLongInt *PCounter, - MilanLongInt NumMessagesBundled, - MilanLongInt *msgActualPtr, - MilanLongInt *MessageIndexPtr, - MilanInt numProcs, - MilanInt myRank, - MPI_Comm comm, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - vector &SRequest, - vector &SStatus); + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt 
myRank, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); void processMessages( MilanLongInt NLVer, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ffe27f68..950e844b 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -278,12 +278,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); - -//#define PRINT_DEBUG_INFO_ processMatchedVertices(NLVer, - UChunkBeingProcessed, + //UChunkBeingProcessed, U, privateU, StartIndex, diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 59a87bc3..1c41b439 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -1,5 +1,7 @@ #include "MatchBoxPC.h" + +//TODO parallelize this ///Find the owner of a ghost node: MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs) { diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 3816080c..6255b35c 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -4,7 +4,6 @@ void processMatchedVertices( MilanLongInt NLVer, - vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, @@ -37,7 +36,6 @@ void processMatchedVertices( { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; - #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -50,276 +48,246 @@ void processMatchedVertices( #pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { - // TODO what would be the optimal UCHUNK - // TODO refactor - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); + // TODO what would be the optimal UCHUNK + // TODO refactor + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); - while (!U.empty()) - { + while (!U.empty()) + { - extractUChunk(UChunkBeingProcessed, U, privateU); + extractUChunk(UChunkBeingProcessed, U, privateU); - for (MilanLongInt u : UChunkBeingProcessed) - { + for (MilanLongInt u : UChunkBeingProcessed) + { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; 
k++) - { - v = verLocInd[k]; + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - - bool seh = false; -#pragma omp critical(prova) - { - seh = candidateMate[v - StartIndex] != u; - } - if (seh) - continue; - -#pragma omp critical(prova) + // If the current vertex is pointing to a matched vertex and is not matched + // FIXME is there a way to make candidateMate private? + // for the moment it could generate an error. + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - } +#pragma omp critical + { + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { + // If found a dominating edge: + if (w >= 0) + { - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - -#pragma omp critical(prova) - { - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - while (!omp_test_lock(&MateLock[v - StartIndex])) - ; - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // 
w is a ghost vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); + (*NumMessagesBundledPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + privateU.push_back(v); + privateU.push_back(w); #pragma omp atomic - (*myCardPtr)++; + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); - omp_unset_lock(&MateLock[v - StartIndex]); - } // End of if CandidateMate[w] = v - } - } // End of if a Ghost Vertex - else - { // w is a local vertex -#pragma omp critical(prova) - { - if (candidateMate[w - StartIndex] == v) - { - while (!omp_test_lock(&MateLock[v - StartIndex])) - ; - while (!omp_test_lock(&MateLock[w - StartIndex])) - ; - - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - // Q.push_back(u); - privateU.push_back(v); - privateU.push_back(w); + // Decrement the counter: + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + privateU.push_back(v); + privateU.push_back(w); #pragma omp atomic - (*myCardPtr)++; + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - omp_unset_lock(&MateLock[v - StartIndex]); - omp_unset_lock(&MateLock[w - StartIndex]); - } // End of if(CandidateMate(w) = v - } - } // End of Else - - } // End of if(w >=0) - else - { - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + } // End of if(CandidateMate(w) = v + } // End of Else + + } // End of if(w >=0) + else + { + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - 
privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - - } // End of If (candidateMate[v-StartIndex] == u - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex - - while (!omp_test_lock(&MateLock[u - StartIndex])) - ; -#pragma omp critical(prova) - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - } - if (v != Mate[u - StartIndex]) - { // u is local + (*NumMessagesBundledPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } + } // End of task + } // End of If (candidateMate[v-StartIndex] == u + + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + { // u is local #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - } // End of If( v != Mate[u] ) - - omp_unset_lock(&MateLock[u - StartIndex]); + (*NumMessagesBundledPtr)++; + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); - } // End of Else //A Ghost Vertex + } // End of If( v != Mate[u] ) - } // End of for + } // End of task + } // End of Else //A Ghost Vertex + } // End of inner for - // TODO commenting that part of code might generate errors - // Ask for the critical section only when there are no more data to - // compute. - if (/*privateU.size() < UCHUNK &&*/ !U.empty()) - continue; + // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? 
+ if (privateU.size() > UCHUNK || U.empty()) + { #pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } #ifndef error #pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } - } + { + while (!privateQLocalVtx.empty()) + { + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); + } + } + #endif - } - } - } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + } // End of private.size() + } + } // End of outer for + } // End of while ( !U.empty() ) + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); #endif - } + } // End of parallel region } From 9ab54adf3f392de5a30dac7df1533e54dd4d5c39 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 08:59:23 -0500 Subject: [PATCH 60/96] processMatchedVertices parallelized --- .../aggregator/processMatchedVertices.cpp | 314 +++++++++--------- 1 file changed, 166 insertions(+), 148 deletions(-) diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 6255b35c..bfdbd6cb 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -36,6 +36,7 @@ void processMatchedVertices( { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + int option; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -45,56 +46,55 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { - // TODO what would be the optimal UCHUNK - // TODO refactor - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); + // TODO what would be the optimal UCHUNK + // TODO refactor + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); - while (!U.empty()) - { + while (!U.empty()) + { - extractUChunk(UChunkBeingProcessed, U, privateU); + extractUChunk(UChunkBeingProcessed, U, privateU); - for (MilanLongInt u : UChunkBeingProcessed) - { + for (MilanLongInt u : UChunkBeingProcessed) + { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: 
" << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - v = verLocInd[k]; + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + option = -1; + v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif - // If the current vertex is pointing to a matched vertex and is not matched - // FIXME is there a way to make candidateMate private? - // for the moment it could generate an error. - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { + // If the current vertex is pointing to a matched vertex and is not matched + if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { #pragma omp critical + { + if (candidateMate[v - StartIndex] == u) { - if (candidateMate[v - StartIndex] == u) - { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], @@ -122,171 +122,189 @@ void processMatchedVertices( cout << "\n(" << myRank << ")Sending a request message:"; cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic - PCounter[ghostOwner]++; -#pragma omp atomic - (*msgIndPtr)++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); + option = 2; if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + option = 1; Mate[v - StartIndex] = w; // v is a local vertex GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - privateU.push_back(v); - privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif // Decrement the counter: PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex + } // End of if a Ghost Vertex else { // w is a local vertex if (candidateMate[w - StartIndex] == v) { + option = 3; Mate[v - StartIndex] = w; // v is a local vertex Mate[w - StartIndex] = v; // w is a local vertex - privateU.push_back(v); - privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif } // End of if(CandidateMate(w) = v - } // End of Else + } // End of Else } // End of if(w >=0) - else - { - adj11 = verLocPtr[v - StartIndex]; 
- adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + else option = 4;// End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } + } // End of task + } // End of If (candidateMate[v-StartIndex] == u + + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + case 2: + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } - } // End of task - } // End of If (candidateMate[v-StartIndex] == u - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex + (*NumMessagesBundledPtr)++; + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost -#pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is local +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + (*msgIndPtr)++; +#pragma omp atomic + (*NumMessagesBundledPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + 
privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + default: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*msgIndPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); + (*NumMessagesBundledPtr)++; + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); - } // End of If( v != Mate[u] ) + break; + } //End of switch - } // End of task - } // End of Else //A Ghost Vertex - } // End of inner for + } // End of inner for - // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? - if (privateU.size() > UCHUNK || U.empty()) - { + // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? + if (privateU.size() > UCHUNK || U.empty()) + { #pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } #ifndef error #pragma omp critical(privateMsg) + { + while (!privateQLocalVtx.empty()) { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } + QLocalVtx.push_back(privateQLocalVtx.pop_back()); + QGhostVtx.push_back(privateQGhostVtx.pop_back()); + QMsgType.push_back(privateQMsgType.pop_back()); + QOwner.push_back(privateQOwner.pop_back()); } + } #endif - } // End of private.size() - } - } // End of outer for - } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + } // End of private.size() + } + } // End of outer for + } // End of while ( !U.empty() ) + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); #ifdef COUNT_LOCAL_VERTEX - printf("Count local vertexes: %ld for thread %d of processor %d\n", - localVertices, - omp_get_thread_num(), - myRank); + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); #endif } // End of parallel region From d24c8c2d46591bf818fe14324e7af7ee2455b8bb Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 09:43:48 -0500 Subject: [PATCH 61/96] processCrossEdges is now atomic --- amgprec/impl/aggregator/processCrossEdge.cpp | 24 +++++----- .../aggregator/processMatchedVertices.cpp | 45 ++++++++++--------- 2 files 
changed, 35 insertions(+), 34 deletions(-) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index ee367a61..45b3918e 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -4,20 +4,20 @@ void PROCESS_CROSS_EDGE(vector &Counter, MilanLongInt edge, MilanLongInt *SPtr) { - // Decrement the counter: // Start: PARALLEL_PROCESS_CROSS_EDGE_B - if (Counter[edge] > 0) - { - Counter[edge] -= 1; // Decrement - if (Counter[edge] == 0) - { - (*SPtr)--; // Decrement S + MilanLongInt captureCounter; + +#pragma omp atomic capture + captureCounter = --Counter[edge]; // Decrement + + if (captureCounter == 0) +#pragma omp atomic + (*SPtr)--; // Decrement S + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; - fflush(stdout); + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; + fflush(stdout); #endif - } - } // End of if Counter[edge] > 0 - // End: PARALLEL_PROCESS_CROSS_EDGE_B + // End: PARALLEL_PROCESS_CROSS_EDGE_B } \ No newline at end of file diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index bfdbd6cb..87ea7a4f 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -130,8 +130,6 @@ void processMatchedVertices( Mate[v - StartIndex] = w; // v is a local vertex GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -150,7 +148,8 @@ void processMatchedVertices( } // End of Else } // End of if(w >=0) - else option = 4;// End of Else: w == -1 + else + option = 4; // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } } // End of task @@ -164,10 +163,10 @@ void processMatchedVertices( { if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) option = 5; // u is local - } // End of critical - } // End of Else //A Ghost Vertex - + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex switch (option) { @@ -184,6 +183,8 @@ void processMatchedVertices( cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); #endif + // Decrement the counter: + PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); case 2: // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); @@ -241,29 +242,29 @@ void processMatchedVertices( } // End of for loop break; default: - + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); #pragma omp atomic 
- PCounter[ghostOwner]++;
+ PCounter[ghostOwner]++;
 #pragma omp atomic
- (*msgIndPtr)++;
+ (*msgIndPtr)++;
 #pragma omp atomic
- (*NumMessagesBundledPtr)++;
- privateQLocalVtx.push_back(u);
- privateQGhostVtx.push_back(v);
- privateQMsgType.push_back(SUCCESS);
- privateQOwner.push_back(ghostOwner);
+ (*NumMessagesBundledPtr)++;
+ privateQLocalVtx.push_back(u);
+ privateQGhostVtx.push_back(v);
+ privateQMsgType.push_back(SUCCESS);
+ privateQOwner.push_back(ghostOwner);

 break;
- } //End of switch
+ } // End of switch

 } // End of inner for

From cb660e044d9203f3eee0e3e47edadef4f2a2061f Mon Sep 17 00:00:00 2001
From: StefanoPetrilli
Date: Sun, 17 Jul 2022 11:27:17 -0500
Subject: [PATCH 62/96] Remove MateLock

---
 amgprec/impl/aggregator/MatchBoxPC.h | 7 ++-----
 ...eApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 10 ++--------
 amgprec/impl/aggregator/clean.cpp | 7 +------
 amgprec/impl/aggregator/initialize.cpp | 5 -----
 amgprec/impl/aggregator/processCrossEdge.cpp | 2 ++
 amgprec/impl/aggregator/processMatchedVertices.cpp | 3 +--
 6 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h
index fba63883..1ff2cb56 100644
--- a/amgprec/impl/aggregator/MatchBoxPC.h
+++ b/amgprec/impl/aggregator/MatchBoxPC.h
@@ -217,7 +217,6 @@ extern "C"
 MilanLongInt *S,
 MilanLongInt *verLocInd,
 MilanLongInt *verLocPtr,
- omp_lock_t *MateLock,
 map &Ghost2LocalMap,
 vector &Counter,
 vector &verGhostPtr,
@@ -249,8 +248,7 @@ extern "C"
 MilanLongInt msgInd,
 MilanLongInt *msgIndSent,
 MilanLongInt NumMessagesBundled,
- MilanReal *msgPercent,
- omp_lock_t *MateLock);
+ MilanReal *msgPercent);

 void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
 MilanLongInt *verLocPtr,
@@ -322,8 +320,7 @@ extern "C"
 staticQueue &privateQLocalVtx,
 staticQueue &privateQGhostVtx,
 staticQueue &privateQMsgType,
- staticQueue &privateQOwner,
- omp_lock_t *MateLock);
+ staticQueue &privateQOwner);

 void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
 MilanInt *BufferSizePtr,
diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
index 950e844b..8c02ddcf 100644
--- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
+++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp
@@ -190,14 +190,10 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 MilanInt BufferSize;
 MilanLongInt *Buffer;

- // Declare the locks
- omp_lock_t MateLock[NLVer];
-
 initialize(NLVer, NLEdge, StartIndex,
 EndIndex, &numGhostEdges,
 &numGhostVertices, &S,
 verLocInd, verLocPtr,
- MateLock,
 Ghost2LocalMap, Counter,
 verGhostPtr, verGhostInd,
 tempCounter, GMate,
@@ -307,8 +303,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 privateQLocalVtx,
 privateQGhostVtx,
 privateQMsgType,
- privateQOwner,
- MateLock);
+ privateQOwner);

 /////////////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
@@ -616,8 +611,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 msgInd,
 msgIndSent,
 NumMessagesBundled,
- msgPercent,
- MateLock);
+ msgPercent);

 finishTime = MPI_Wtime();
 *ph2_time = finishTime - startTime; // Time taken for Phase-2
diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp
index 
d91076c9..29fa351d 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -16,8 +16,7 @@ void clean(MilanLongInt NLVer, MilanLongInt msgInd, MilanLongInt *msgIndSent, MilanLongInt NumMessagesBundled, - MilanReal *msgPercent, - omp_lock_t *MateLock) + MilanReal *msgPercent) { // Cleanup Phase @@ -83,10 +82,6 @@ void clean(MilanLongInt NLVer, *msgPercent = 0; } } - // Destroy the locks -#pragma omp taskloop num_tasks(NUM_THREAD) - for (int i = 0; i < NLVer; i++) - omp_destroy_lock(&MateLock[i]); #ifdef DEBUG_HANG_ if (myRank == 0) diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 979cdcf5..8325e455 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -7,7 +7,6 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *S, MilanLongInt *verLocInd, MilanLongInt *verLocPtr, - omp_lock_t *MateLock, map &Ghost2LocalMap, vector &Counter, vector &verGhostPtr, @@ -40,10 +39,6 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp single { - // Initialize the locks -#pragma omp taskloop num_tasks(NUM_THREAD) - for (i = 0; i < NLVer; i++) - omp_init_lock(&MateLock[i]); #ifdef TIME_TRACKER double Ghost2LocalInitialization = MPI_Wtime(); diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 45b3918e..30efd79d 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -10,6 +10,8 @@ void PROCESS_CROSS_EDGE(vector &Counter, #pragma omp atomic capture captureCounter = --Counter[edge]; // Decrement + //assert(captureCounter >= 0); + if (captureCounter == 0) #pragma omp atomic (*SPtr)--; // Decrement S diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 87ea7a4f..11d7466d 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -31,8 +31,7 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner, - omp_lock_t *MateLock) + staticQueue &privateQOwner) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; From 4f07a70ed13826586538bc5c15bf12a1cbbd59f1 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 11:48:52 -0500 Subject: [PATCH 63/96] initialize refactoring --- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 2 +- amgprec/impl/aggregator/initialize.cpp | 139 +++++++++---------- 2 files changed, 67 insertions(+), 74 deletions(-) diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 1c41b439..109802de 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -5,7 +5,7 @@ ///Find the owner of a ghost node: MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt myRank, MilanInt numProcs) { - //MilanLongInt Size = mVerDistance.size(); + MilanLongInt mStartInd = mVerDistance[myRank]; MilanInt Start = 0; MilanInt End = numProcs; diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 8325e455..3e7ac207 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -1,36 +1,35 @@ #include "MatchBoxPC.h" void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, - 
MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *numGhostEdgesPtr, - MilanLongInt *numGhostVerticesPtr, - MilanLongInt *S, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - map &Ghost2LocalMap, - vector &Counter, - vector &verGhostPtr, - vector &verGhostInd, - vector &tempCounter, - vector &GMate, - vector &Message, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdges, + MilanLongInt *numGhostVertices, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + staticQueue &U, + staticQueue &privateU, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) { - MilanLongInt insertMe = 0, numGhostEdges = 0, numGhostVertices = 0; + MilanLongInt insertMe = 0; MilanLongInt adj1, adj2; int i, v, k, w; - // index that starts with zero to |Vg| - 1 map::iterator storedAlready; @@ -55,19 +54,17 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * only when a ghost edge is found and ghost edges are a minority, * circa 3.5% during the tests. */ - #pragma omp task depend(out \ - : numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, numGhostVertices) + : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices) { - -#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ - : numGhostEdges) +#pragma omp taskloop num_tasks(NUM_THREAD) for (i = 0; i < NLEdge; i++) { // O(m) - Each edge stored twice insertMe = verLocInd[i]; if ((insertMe < StartIndex) || (insertMe > EndIndex)) { // Find a ghost - numGhostEdges++; +#pragma omp atomic + (*numGhostEdges)++; #pragma omp critical { storedAlready = Ghost2LocalMap.find(insertMe); @@ -76,24 +73,24 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, Counter[storedAlready->second]++; // Increment the counter } else - { // Insert an entry for the ghost: - Ghost2LocalMap[insertMe] = numGhostVertices; // Add a map entry - Counter.push_back(1); // Initialize the counter - numGhostVertices++; // Increment the number of ghost vertices - } // End of else() + { // Insert an entry for the ghost: + Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry + Counter.push_back(1); // Initialize the counter + (*numGhostVertices)++; // Increment the number of ghost vertices + } // End of else() } } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) } // End of for(ghost vertices) } // end of task depend - // numGhostEdges = atomicNumGhostEdges; + // *numGhostEdges = atomicNumGhostEdges; #ifdef TIME_TRACKER Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")NGhosts:" << numGhostVertices << " GhostEdges: " << numGhostEdges; + cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges; if (!Ghost2LocalMap.empty()) { cout << "\n(" << myRank << 
")Final Map : on process "; @@ -111,16 +108,16 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ - : numGhostVertices, numGhostEdges) + : *numGhostVertices, *numGhostEdges) { // Initialize adjacency Lists for Ghost Vertices: try { - verGhostPtr.reserve(numGhostVertices + 1); // Pointer Vector - tempCounter.reserve(numGhostVertices); // Pointer Vector - verGhostInd.reserve(numGhostEdges); // Index Vector - GMate.reserve(numGhostVertices); // Ghost Mate Vector + verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector + tempCounter.reserve(*numGhostVertices); // Pointer Vector + verGhostInd.reserve(*numGhostEdges); // Index Vector + GMate.reserve(*numGhostVertices); // Ghost Mate Vector } catch (length_error) { @@ -129,11 +126,11 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, exit(1); } // Initialize the Vectors: - verGhostPtr.resize(numGhostVertices + 1, 0); // Pointer Vector - tempCounter.resize(numGhostVertices, 0); // Temporary Counter - verGhostInd.resize(numGhostEdges, -1); // Index Vector - GMate.resize(numGhostVertices, -1); // Temporary Counter - verGhostPtr[0] = 0; // The first value + verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector + tempCounter.resize(*numGhostVertices, 0); // Temporary Counter + verGhostInd.resize(*numGhostEdges, -1); // Index Vector + GMate.resize(*numGhostVertices, -1); // Temporary Counter + verGhostPtr[0] = 0; // The first value #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Ghost Vertex Pointer: "; fflush(stdout); @@ -143,13 +140,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : verGhostPtr) depend(in \ - : Counter, numGhostVertices) + : Counter, *numGhostVertices) { #ifdef TIME_TRACKER double verGhostPtrInitialization = MPI_Wtime(); #endif - for (i = 0; i < numGhostVertices; i++) + for (i = 0; i < *numGhostVertices; i++) { // O(|Ghost Vertices|) verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; #ifdef PRINT_DEBUG_INFO_ @@ -165,8 +162,8 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, } // End of task #ifdef PRINT_DEBUG_INFO_ - if (numGhostVertices > 0) - cout << verGhostPtr[numGhostVertices] << "\n"; + if (*numGhostVertices > 0) + cout << verGhostPtr[*numGhostVertices] << "\n"; fflush(stdout); #endif @@ -220,22 +217,22 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Ghost Vertex Index: "; - for (v = 0; v < numGhostEdges; v++) + for (v = 0; v < *numGhostEdges; v++) cout << verGhostInd[v] << "\t"; cout << endl; fflush(stdout); #endif -#pragma omp task depend(in \ - : numGhostEdges) depend(out \ - : QLocalVtx, QGhostVtx, QMsgType, QOwner) +#pragma omp task depend(in \ + : *numGhostEdges) depend(out \ + : QLocalVtx, QGhostVtx, QMsgType, QOwner) { try { - QLocalVtx.reserve(numGhostEdges); // Local Vertex - QGhostVtx.reserve(numGhostEdges); // Ghost Vertex - QMsgType.reserve(numGhostEdges); // Message Type (Request/Failure) - QOwner.reserve(numGhostEdges); // Owner of the ghost: COmpute once and use later + QLocalVtx.reserve(*numGhostEdges); // Local Vertex + QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex + QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure) + QOwner.reserve(*numGhostEdges); // Owner of the ghost: COmpute once and use later } catch (length_error) { @@ -268,24 +265,20 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, fflush(stdout); #endif -#pragma omp task 
depend(in \ - : numGhostEdges, numGhostVertices) depend(out \ - : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) +#pragma omp task depend(in \ + : *numGhostVertices) depend(out \ + : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) { - // The values calculated in this function are sent back to the calling function - *numGhostEdgesPtr = numGhostEdges; - *numGhostVerticesPtr = numGhostVertices; - // Allocate Data Structures: /* * candidateMate was a vector and has been replaced with an array * there is no point in using the vector (or maybe there is (???)) * so I replaced it with an array wich is slightly faster */ - candidateMate = new MilanLongInt[NLVer + numGhostVertices]; + candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)]; - *S = numGhostVertices; // Initialize S with number of Ghost Vertices + *S = (*numGhostVertices); // Initialize S with number of Ghost Vertices /* * Create the Queue Data Structure for the Dominating Set @@ -295,13 +288,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ - new (&U) staticQueue(NLVer + numGhostVertices); + new (&U) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I decide a more meaningfull size? - MilanLongInt size = numGhostVertices; + MilanLongInt size = (*numGhostVertices); // Initialize the privte data structure - new (&privateU) staticQueue(NLVer + numGhostVertices); // TODO how can I put a meaningfull size? + new (&privateU) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I put a meaningfull size? new (&privateQLocalVtx) staticQueue(size); new (&privateQGhostVtx) staticQueue(size); new (&privateQMsgType) staticQueue(size); From a71fe82752d79028d98d5585e8ebc108fc6cc58c Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 12:03:48 -0500 Subject: [PATCH 64/96] PROCESS_CROSS_EDGE refactoring --- amgprec/impl/aggregator/MatchBoxPC.h | 3 +-- ...EdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 2 +- amgprec/impl/aggregator/processCrossEdge.cpp | 5 ++--- amgprec/impl/aggregator/processExposedVertex.cpp | 2 +- amgprec/impl/aggregator/processMatchedVertices.cpp | 2 +- amgprec/impl/aggregator/processMessages.cpp | 8 ++++---- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 1ff2cb56..1e84b7ca 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -287,8 +287,7 @@ extern "C" staticQueue &privateQMsgType, staticQueue &privateQOwner); - void PROCESS_CROSS_EDGE(vector &Counter, - MilanLongInt edge, + void PROCESS_CROSS_EDGE(MilanLongInt *edge, MilanLongInt *SPtr); void processMatchedVertices( diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 8c02ddcf..ab031f68 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -456,7 +456,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); } // End of if CandidateMate[w] = v } // End of 
if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 30efd79d..30af9f20 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,14 +1,13 @@ #include "MatchBoxPC.h" -void PROCESS_CROSS_EDGE(vector &Counter, - MilanLongInt edge, +void PROCESS_CROSS_EDGE(MilanLongInt *edge, MilanLongInt *SPtr) { // Start: PARALLEL_PROCESS_CROSS_EDGE_B MilanLongInt captureCounter; #pragma omp atomic capture - captureCounter = --Counter[edge]; // Decrement + captureCounter = --(*edge); // Decrement //assert(captureCounter >= 0); diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 97840b19..a2ea6c8d 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -115,7 +115,7 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, //TODO refactor this!! // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], &S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 11d7466d..fe983285 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -183,7 +183,7 @@ void processMatchedVertices( fflush(stdout); #endif // Decrement the counter: - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], SPtr); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); case 2: // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 7e5c3915..c812e96d 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -194,7 +194,7 @@ void processMessages( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); } // End of if ( candidateMate[v-StartIndex] == u )e } // End of if ( Mate[v] == -1 ) } // End of REQUEST @@ -207,7 +207,7 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); #ifdef DEBUG_GHOST_ if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { @@ -260,7 +260,7 @@ void processMessages( fflush(stdout); #endif - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[w], S); + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex else @@ -319,7 +319,7 @@ void processMessages( fflush(stdout); #endif GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(Counter, Ghost2LocalMap[u], S); // Decrease the counter + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter } // End of else: CASE III } // End of else: CASE I } From 3e945c75b44ccf75a097deaabb5700286e052b2c Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 13:20:49 -0500 Subject: [PATCH 65/96] Refactoring, removed all useless Pointer passed in functions --- 
amgprec/impl/aggregator/processCrossEdge.cpp | 4 +- .../impl/aggregator/processExposedVertex.cpp | 96 +++++++++---------- amgprec/impl/aggregator/processMessages.cpp | 25 +++-- .../impl/aggregator/sendBundledMessages.cpp | 58 +++++------ 4 files changed, 85 insertions(+), 98 deletions(-) diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp index 30af9f20..e844f127 100644 --- a/amgprec/impl/aggregator/processCrossEdge.cpp +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -1,7 +1,7 @@ #include "MatchBoxPC.h" void PROCESS_CROSS_EDGE(MilanLongInt *edge, - MilanLongInt *SPtr) + MilanLongInt *S) { // Start: PARALLEL_PROCESS_CROSS_EDGE_B MilanLongInt captureCounter; @@ -13,7 +13,7 @@ void PROCESS_CROSS_EDGE(MilanLongInt *edge, if (captureCounter == 0) #pragma omp atomic - (*SPtr)--; // Decrement S + (*S)--; // Decrement S #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index a2ea6c8d..c28a7c66 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -1,45 +1,43 @@ #include "MatchBoxPC.h" void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, - MilanLongInt *candidateMate, - MilanLongInt *verLocInd, - MilanLongInt *verLocPtr, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - MilanLongInt *Mate, - vector &GMate, - map &Ghost2LocalMap, - MilanReal *edgeLocWeight, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, - MilanLongInt *SPtr, - MilanLongInt *verDistance, - MilanLongInt *PCounter, - vector &Counter, - MilanInt myRank, - MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *S, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + staticQueue &U, + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner) { - MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0, S = *SPtr; - MilanLongInt myCard = 0, msgInd = 0; - MilanLongInt NumMessagesBundled = 0; + MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; MilanInt ghostOwner = 0; #pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { #pragma omp for reduction(+ \ - : msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static) + : PCounter[:numProcs]) schedule(static) for (v = 0; v < NLVer; v++) { // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) @@ -76,8 +74,8 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if 
(w >= 0) { - - myCard++; +#pragma omp atomic + (*myCard)++; if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex #ifdef PRINT_DEBUG_INFO_ @@ -85,9 +83,10 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif - - msgInd++; - NumMessagesBundled++; +#pragma omp atomic + (*msgInd)++; +#pragma omp atomic + (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); @@ -97,7 +96,6 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); privateQOwner.push_back(ghostOwner); - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { @@ -113,9 +111,9 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, fflush(stdout); #endif - //TODO refactor this!! - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); + // TODO refactor this!! + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); } // End of if CandidateMate[w] = v } // End of if a Ghost Vertex @@ -159,9 +157,10 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif - - msgInd++; - NumMessagesBundled++; +#pragma omp atomic + (*msgInd)++; +#pragma omp atomic + (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); @@ -184,14 +183,5 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQMsgType, privateQOwner); -//TODO move this outside of the parallel region!! 
-#pragma omp master - { - *myCardPtr = myCard; - *msgIndPtr = msgInd; - *NumMessagesBundledPtr = NumMessagesBundled; - *SPtr = S; - } - } // End of parallel region } \ No newline at end of file diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c812e96d..43fc5ce9 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -9,9 +9,9 @@ void processMessages( vector &Counter, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *msgActualPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *msgActual, MilanReal *edgeLocWeight, MilanLongInt *verDistance, MilanLongInt *verLocPtr, @@ -30,7 +30,7 @@ void processMessages( MilanInt Sender; MPI_Status computeStatus; - MilanLongInt bundleSize, myCard = *myCardPtr, msgInd = *msgIndPtr, msgActual = *msgActualPtr, w; + MilanLongInt bundleSize, w; MilanLongInt adj11, adj12, k1; MilanLongInt ghostOwner; int error_codeC; @@ -188,7 +188,7 @@ void processMessages( Mate[v - StartIndex] = u; // v is local U.push_back(v); U.push_back(u); - myCard++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; fflush(stdout); @@ -246,15 +246,15 @@ void processMessages( assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; + (*msgInd)++; + (*msgActual)++; if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { Mate[v - StartIndex] = w; // v is local GMate[Ghost2LocalMap[w]] = v; // w is ghost U.push_back(v); U.push_back(w); - myCard++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; fflush(stdout); @@ -272,7 +272,7 @@ void processMessages( // Q.push_back(u); U.push_back(v); U.push_back(w); - myCard++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; fflush(stdout); @@ -303,8 +303,8 @@ void processMessages( assert(ghostOwner != -1); assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; + (*msgInd)++; + (*msgActual)++; } // End of if(GHOST) } // End of for loop } // End of Else: w == -1 @@ -324,8 +324,5 @@ void processMessages( } // End of else: CASE I } - *myCardPtr = myCard; - *msgIndPtr = msgInd; - *msgActualPtr = msgActual; return; } \ No newline at end of file diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index f3dd3e46..8665759c 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -1,15 +1,15 @@ #include "MatchBoxPC.h" -void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, - MilanInt *BufferSizePtr, +void sendBundledMessages(MilanLongInt *numGhostEdges, + MilanInt *BufferSize, MilanLongInt *Buffer, vector &PCumulative, vector &PMessageBundle, vector &PSizeInfoMessages, MilanLongInt *PCounter, MilanLongInt NumMessagesBundled, - MilanLongInt *msgActualPtr, - MilanLongInt *MessageIndexPtr, + MilanLongInt *msgActual, + MilanLongInt *MessageIndex, MilanInt numProcs, MilanInt myRank, MPI_Comm comm, @@ -21,8 +21,8 @@ void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, vector &SStatus) { - MilanLongInt myIndex = 0, msgActual = *msgActualPtr, MessageIndex = *MessageIndexPtr, numGhostEdges = *numGhostEdgesPtr, numMessagesToSend; - 
MilanInt i = 0, OneMessageSize = 0, BufferSize = *BufferSizePtr; + MilanLongInt myIndex = 0, numMessagesToSend; + MilanInt i = 0, OneMessageSize = 0; #ifdef DEBUG_HANG_ if (myRank == 0) @@ -105,7 +105,7 @@ PSizeInfoMessages.resize(numProcs * 3, 0); // Send the Messages #pragma omp task depend(inout \ : SRequest, PSizeInfoMessages, PCumulative) depend(out \ - : msgActual, MessageIndex) + : *msgActual, *MessageIndex) { for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! @@ -124,9 +124,9 @@ PSizeInfoMessages.resize(numProcs * 3, 0); if (PSizeInfoMessages[i * 3 + 0] > 0) { // Send only if it is a nonempty packet MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[MessageIndex]); - msgActual++; - MessageIndex++; + &SRequest[(*MessageIndex)]); + (*msgActual)++; + (*MessageIndex)++; // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; @@ -136,8 +136,8 @@ PSizeInfoMessages.resize(numProcs * 3, 0); fflush(stdout); #endif MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[MessageIndex]); - MessageIndex++; + TypeMap(), i, BundleTag, comm, &SRequest[(*MessageIndex)]); + (*MessageIndex)++; } // End of if size > 0 } } @@ -154,16 +154,16 @@ PSizeInfoMessages.resize(numProcs * 3, 0); QOwner.clear(); } -#pragma omp task depend(inout : OneMessageSize, BufferSize) depend(out : numMessagesToSend) depend(in : numGhostEdges) +#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; - cout << "\n(" << myRank << ")Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; - if (numGhostEdges > 0) + if (*numGhostEdges > 0) { - cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(numGhostEdges * 2)) * 100.0 << "% \n"; + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; } fflush(stdout); #endif @@ -177,39 +177,39 @@ PSizeInfoMessages.resize(numProcs * 3, 0); // Request, Success, Failure. // But only two will be sent from a given processor. 
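The next lines turn that worst case into the buffer that backs the later MPI_Bsend calls: one packed message plus MPI_BSEND_OVERHEAD for every message still expected, attached once with MPI_Buffer_attach. A hedged sketch of the same sizing-and-attach step, with illustrative names and assuming three MilanLongInt values per message:

#include <mpi.h>
#include <cstdlib>

long *attachBsendBuffer(int expectedMessages, MPI_Comm comm, int *bufferSize)
{
    int oneMessageSize = 0;
    MPI_Pack_size(3, MPI_LONG, comm, &oneMessageSize);   // room for one 3-value message
    *bufferSize = (oneMessageSize + MPI_BSEND_OVERHEAD) * expectedMessages;
    long *buffer = nullptr;
    if (*bufferSize > 0) {
        buffer = (long *)malloc(*bufferSize);
        MPI_Buffer_attach(buffer, *bufferSize);          // detach (and only then free) when done
    }
    return buffer;
}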
// Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = numGhostEdges * 2 - NumMessagesBundled; - BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; + numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; + *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; } -#pragma omp task depend(out : Buffer) depend(in : BufferSize) +#pragma omp task depend(out : Buffer) depend(in : *BufferSize) { Buffer = 0; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; - cout << "\n(" << myRank << ")Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; - cout << "\n(" << myRank << ")BufferSize = " << BufferSize; + cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); cout << "\n(" << myRank << ")Attaching Buffer on.. "; fflush(stdout); #endif - if (BufferSize > 0) + if ((*BufferSize) > 0) { - Buffer = (MilanLongInt *)malloc(BufferSize); // Allocate memory + Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory if (Buffer == 0) { cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; exit(1); } - MPI_Buffer_attach(Buffer, BufferSize); // Attach the Buffer + MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer } } } } -*MessageIndexPtr = MessageIndex; -*msgActualPtr = msgActual; -*numGhostEdgesPtr = numGhostEdges; -*BufferSizePtr = BufferSize; +//*MessageIndexPtr = MessageIndex; +//*msgActualPtr = msgActual; +//*numGhostEdgesPtr = numGhostEdges; +//*BufferSizePtr = BufferSize; } \ No newline at end of file From 44f174a5714063b50cb08a689b237258aea2f75b Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sun, 17 Jul 2022 13:44:58 -0500 Subject: [PATCH 66/96] findOwnerOfGhost optimization and refactor --- amgprec/impl/aggregator/findOwnerOfGhost.cpp | 57 +++++++------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp index 109802de..b9d60614 100644 --- a/amgprec/impl/aggregator/findOwnerOfGhost.cpp +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -1,48 +1,29 @@ #include "MatchBoxPC.h" - -//TODO parallelize this -///Find the owner of a ghost node: +/// Find the owner of a ghost node: MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs) { + MilanInt myRank, MilanInt numProcs) +{ MilanLongInt mStartInd = mVerDistance[myRank]; MilanInt Start = 0; MilanInt End = numProcs; MilanInt Current = 0; -#if 0 - if ( vtxIndex < mStartInd ) - End = myRank; - else - Start = myRank; -#endif + while (Start <= End) + { + Current = (End + Start) / 2; + // CASE-1: + if (mVerDistance[Current] == vtxIndex) return Current; + else // CASE 2: + if (mVerDistance[Current] > vtxIndex) + End = Current - 1; + else // CASE 3: + Start = Current + 1; + } // End of While() + + if (mVerDistance[Current] > vtxIndex) + return (Current - 1); - while ( Start <= End ) { - Current = (End + Start)/2; - //CASE-1: - if ( mVerDistance[Current] == vtxIndex ) { - while ( mVerDistance[Current+1] == vtxIndex ) { - Current++; - if ( Current == numProcs ) - return (-1); - } 
- return (Current); - } - else { //CASE 2: - if ( mVerDistance[Current] > vtxIndex ) - End = Current - 1; - else //CASE 3: - Start = Current + 1; - } - } //End of While() - if ( Current == 0 ) - return (Current); - else { - if ( mVerDistance[Current] > vtxIndex ) - return (Current-1); - else - return (Current); - } //End of else - return (-1); //It should not reach here! -} //End of findOwnerOfGhost() + return Current; +} // End of findOwnerOfGhost() From 22d9baf29608a3f3b190f33ac735361528f56ae9 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Mon, 18 Jul 2022 14:11:53 -0500 Subject: [PATCH 67/96] isAlreadyMatched substituted with atomic read in one place --- amgprec/impl/aggregator/MatchBoxPC.h | 1 + ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 204 +++++++++--------- amgprec/impl/aggregator/isAlreadyMatched.cpp | 1 + .../aggregator/processMatchedVertices.cpp | 15 +- 4 files changed, 108 insertions(+), 113 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 1e84b7ca..e8a2e2cc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -292,6 +292,7 @@ extern "C" void processMatchedVertices( MilanLongInt NLVer, + vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index ab031f68..a4fb68e5 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -274,8 +274,12 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// + // TODO what would be the optimal UCHUNK + vector UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + processMatchedVertices(NLVer, - //UChunkBeingProcessed, + UChunkBeingProcessed, U, privateU, StartIndex, @@ -382,117 +386,47 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) { // v is a Local Vertex: - if (Mate[v - StartIndex] >= 0) // v is already matched - continue; + //if (Mate[v - StartIndex] >= 0) // v is already matched + // continue; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; fflush(stdout); #endif - if (candidateMate[v - StartIndex] == u) - { // Only if pointing to the matched vertex - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - w = -1; - heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN - for (k1 = adj11; k1 < adj12; k1++) - { - if ((verLocInd[k1] < StartIndex) || (verLocInd[k1] > EndIndex)) - { // Is it a ghost vertex? 
- if (GMate[Ghost2LocalMap[verLocInd[k1]]] >= 0) // Already matched - continue; - } - else - { // A local vertex - if (Mate[verLocInd[k1] - StartIndex] >= 0) // Already matched - continue; - } - - if ((edgeLocWeight[k1] > heaviestEdgeWt) || - ((edgeLocWeight[k1] == heaviestEdgeWt) && (w < verLocInd[k1]))) - { - heaviestEdgeWt = edgeLocWeight[k1]; - w = verLocInd[k1]; - } - } // End of for loop - candidateMate[v - StartIndex] = w; - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - GMate[Ghost2LocalMap[w]] = v; // w is ghost - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; + // If the current vertex is pointing to a matched vertex and is not matched + if (Mate[v - StartIndex] < 0) + { + if (candidateMate[v - StartIndex] == u) + { // Only if pointing to the matched vertex + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - { // no dominating edge found: w == -1 - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) + // If found a dominating edge: + if (w >= 0) { - w = verLocInd[k1]; if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + { // w is a ghost // Build the Message Packet: Message[0] = v; // LOCAL Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE + Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Sending a request message:"; cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << 
findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif @@ -502,12 +436,72 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); msgInd++; msgActual++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u) - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + myCard++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + { // no dominating edge found: w == -1 + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + msgInd++; + msgActual++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u) + } + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor v is a ghost vertex if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index 38ae73f5..d4efd416 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -1,5 +1,6 @@ #include "MatchBoxPC.h" +//TODO can be optimized!! 
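The computeCandidateMate call introduced in the CMP hunk above stands in for the inline search that was removed there: it has to return the heaviest neighbour that is not yet matched, breaking weight ties towards the larger vertex index. A sketch of that selection with illustrative names, reading the matched status from Mate for local vertices and from GMate for ghosts:

#include <cfloat>
#include <map>
#include <vector>

long heaviestUnmatchedNeighbor(long adj1, long adj2, const double *edgeLocWeight,
                               const long *verLocInd, long StartIndex, long EndIndex,
                               const std::vector<long> &GMate, const long *Mate,
                               std::map<long, long> &Ghost2LocalMap)
{
    long w = -1;
    double heaviest = -DBL_MAX;
    for (long k = adj1; k < adj2; k++) {
        long cand = verLocInd[k];
        // ghost neighbours are looked up through Ghost2LocalMap/GMate, local ones in Mate
        bool matched = (cand < StartIndex || cand > EndIndex)
                           ? GMate[Ghost2LocalMap[cand]] >= 0
                           : Mate[cand - StartIndex] >= 0;
        if (matched)
            continue;
        if (edgeLocWeight[k] > heaviest ||
            (edgeLocWeight[k] == heaviest && cand > w)) {
            heaviest = edgeLocWeight[k];
            w = cand;
        }
    }
    return w;   // -1 when every neighbour is already matched
}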
/** * //TODO documentation * @param k diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index fe983285..0054ffa2 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -4,6 +4,7 @@ void processMatchedVertices( MilanLongInt NLVer, + vector &UChunkBeingProcessed, staticQueue &U, staticQueue &privateU, MilanLongInt StartIndex, @@ -36,6 +37,8 @@ void processMatchedVertices( MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; int option; + MilanLongInt mateVal; + #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -45,14 +48,9 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) { - // TODO what would be the optimal UCHUNK - // TODO refactor - vector UChunkBeingProcessed; - UChunkBeingProcessed.reserve(UCHUNK); - while (!U.empty()) { @@ -86,9 +84,10 @@ void processMatchedVertices( cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; fflush(stdout); #endif - +#pragma omp atomic read + mateVal = Mate[v - StartIndex]; // If the current vertex is pointing to a matched vertex and is not matched - if (not isAlreadyMatched(v, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + if (mateVal < 0) { #pragma omp critical { From cdf92ea2b247e1b35f5e6ea6413b7104076654e7 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 20 Jul 2022 15:37:29 -0500 Subject: [PATCH 68/96] processMatchedVerticess add send messages with error --- amgprec/impl/aggregator/MatchBoxPC.h | 49 ++++++---- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 60 ++++++++++-- .../aggregator/processMatchedVertices.cpp | 98 +++++++++++++++---- amgprec/impl/aggregator/processMessages.cpp | 4 +- .../impl/aggregator/sendBundledMessages.cpp | 17 ++-- amgprec/stZnqhkT | 1 + 6 files changed, 169 insertions(+), 60 deletions(-) create mode 100644 amgprec/stZnqhkT diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index e8a2e2cc..dafe381e 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -76,6 +76,8 @@ const MilanLongInt SIZEINFO = 4; const int ComputeTag = 7; // Predefined tag const int BundleTag = 9; // Predefined tag +static vector DEFAULT_VECTOR; + // MPI type map template MPI_Datatype TypeMap(); @@ -320,27 +322,32 @@ extern "C" staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner); - - void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, - MilanInt *BufferSizePtr, - MilanLongInt *Buffer, - vector &PCumulative, - vector &PMessageBundle, - vector &PSizeInfoMessages, - MilanLongInt *PCounter, - MilanLongInt NumMessagesBundled, - MilanLongInt *msgActualPtr, - MilanLongInt *MessageIndexPtr, - MilanInt numProcs, - MilanInt myRank, - MPI_Comm 
comm, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - vector &SRequest, - vector &SStatus); + staticQueue &privateQOwner, + bool sendMessages = false, + MPI_Comm comm = NULL, + MilanLongInt *msgActual = nullptr, + MilanLongInt *msgInd = nullptr, + vector &Message = DEFAULT_VECTOR); + + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); void processMessages( MilanLongInt NLVer, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index a4fb68e5..99fd57c3 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -368,9 +368,51 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - while (/*!Q.empty()*/ !U.empty()) + ///* + +//#define error +#ifdef error + processMatchedVertices(NLVer, + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner, + true, + comm, + &msgActual, + &msgInd, + Message); +#endif +#ifndef error + + while (!U.empty()) { - // Q.pop_front(); + u = U.pop_front(); // Get an element from the queue #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")u: " << u; @@ -385,9 +427,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( { v = verLocInd[k]; if ((v >= StartIndex) && (v <= EndIndex)) - { // v is a Local Vertex: - //if (Mate[v - StartIndex] >= 0) // v is already matched - // continue; + { // v is a Local Vertex: + // if (Mate[v - StartIndex] >= 0) // v is already matched + // continue; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; fflush(stdout); @@ -500,8 +542,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) } // End of If (candidateMate[v-StartIndex] == u) - } - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + } // if (Mate[v - StartIndex] < 0) + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor v is a ghost vertex if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) @@ -536,7 +578,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( } // End of Else //A Ghost Vertex } // End of For Loop adj(u) } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process 
Only If a Local Vertex - } // End of while ( /*!Q.empty()*/ !U.empty() ) + } // End of while ( !U.empty() ) +#endif + ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// //// BREAK IF NO MESSAGES EXPECTED ///////// diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 0054ffa2..296bfa15 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -32,7 +32,12 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner) + staticQueue &privateQOwner, + bool sendMessages, + MPI_Comm comm, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + vector &Message) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; @@ -48,7 +53,7 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) { while (!U.empty()) @@ -144,16 +149,14 @@ void processMatchedVertices( #endif } // End of if(CandidateMate(w) = v } // End of Else - - } // End of if(w >=0) + } // End of if(w >=0) else option = 4; // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } - } // End of task - } // End of If (candidateMate[v-StartIndex] == u - - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: else { // Neighbor is a ghost vertex @@ -184,16 +187,36 @@ void processMatchedVertices( // Decrement the counter: PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); case 2: + // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + //#pragma omp master + // { + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +// } #pragma omp atomic - PCounter[ghostOwner]++; + (*msgActual)++; + } + else + { #pragma omp atomic - (*msgIndPtr)++; + PCounter[ghostOwner]++; #pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundledPtr)++; + } + +#pragma omp atomic + (*msgIndPtr)++; + privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); privateQMsgType.push_back(REQUEST); @@ -224,12 +247,30 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + //#pragma omp master + // { + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, 
ComputeTag, comm); +// } +#pragma omp atomic + (*msgActual)++; + } + else + { #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundledPtr)++; + } + #pragma omp atomic - (*NumMessagesBundledPtr)++; + (*msgIndPtr)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -239,6 +280,7 @@ void processMatchedVertices( } // End of if(GHOST) } // End of for loop break; + case 5: default: #ifdef PRINT_DEBUG_INFO_ @@ -250,12 +292,32 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + + // Send a Request (Asynchronous) + //#pragma omp master + // { + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +// } #pragma omp atomic - PCounter[ghostOwner]++; + (*msgActual)++; + } + else + { #pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundledPtr)++; #pragma omp atomic - (*NumMessagesBundledPtr)++; + PCounter[ghostOwner]++; + } + +#pragma omp atomic + (*msgIndPtr)++; + privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); privateQMsgType.push_back(SUCCESS); diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 43fc5ce9..bb21396b 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -104,7 +104,7 @@ void processMessages( ReceiveBuffer.resize(bundleSize, -1); // Initialize #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Message Bundle Before: " << endl; - for (i = 0; i < bundleSize; i++) + for (int i = 0; i < bundleSize; i++) cout << ReceiveBuffer[i] << ","; cout << endl; fflush(stdout); @@ -119,7 +119,7 @@ void processMessages( } #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Message Bundle After: " << endl; - for (i = 0; i < bundleSize; i++) + for (int i = 0; i < bundleSize; i++) cout << ReceiveBuffer[i] << ","; cout << endl; fflush(stdout); diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index 8665759c..f7fd2f78 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -9,7 +9,7 @@ void sendBundledMessages(MilanLongInt *numGhostEdges, MilanLongInt *PCounter, MilanLongInt NumMessagesBundled, MilanLongInt *msgActual, - MilanLongInt *MessageIndex, + MilanLongInt *msgInd, MilanInt numProcs, MilanInt myRank, MPI_Comm comm, @@ -105,7 +105,7 @@ PSizeInfoMessages.resize(numProcs * 3, 0); // Send the Messages #pragma omp task depend(inout \ : SRequest, PSizeInfoMessages, PCumulative) depend(out \ - : *msgActual, *MessageIndex) + : *msgActual, *msgInd) { for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! 
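The hunk below renames MessageIndex to msgInd and keeps it behind a pointer, but the send protocol is unchanged: for every destination with a non-empty packet, a three-value size header goes out first, then the data bundle, and each MPI_Isend records its request under the shared index. A minimal sketch of that two-step send, with illustrative names, MPI_LONG standing in for the TypeMap() datatype used in the patch, and requests already sized for two posts per destination:

#include <mpi.h>
#include <vector>

void sendBundle(long *header, long *payload, int count, int dest, MPI_Comm comm,
                std::vector<MPI_Request> &requests, long *msgInd)
{
    if (count <= 0)
        return;                               // nothing to send to this rank
    header[0] = count;                        // the payload length travels first
    MPI_Isend(header, 3, MPI_LONG, dest, /*ComputeTag*/ 7, comm, &requests[*msgInd]);
    (*msgInd)++;
    MPI_Isend(payload, count, MPI_LONG, dest, /*BundleTag*/ 9, comm, &requests[*msgInd]);
    (*msgInd)++;                              // header and payload must stay live until the waits complete
}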
@@ -124,9 +124,9 @@ PSizeInfoMessages.resize(numProcs * 3, 0); if (PSizeInfoMessages[i * 3 + 0] > 0) { // Send only if it is a nonempty packet MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[(*MessageIndex)]); + &SRequest[(*msgInd)]); (*msgActual)++; - (*MessageIndex)++; + (*msgInd)++; // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; @@ -136,8 +136,8 @@ PSizeInfoMessages.resize(numProcs * 3, 0); fflush(stdout); #endif MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[(*MessageIndex)]); - (*MessageIndex)++; + TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); + (*msgInd)++; } // End of if size > 0 } } @@ -207,9 +207,4 @@ PSizeInfoMessages.resize(numProcs * 3, 0); } } } - -//*MessageIndexPtr = MessageIndex; -//*msgActualPtr = msgActual; -//*numGhostEdgesPtr = numGhostEdges; -//*BufferSizePtr = BufferSize; } \ No newline at end of file diff --git a/amgprec/stZnqhkT b/amgprec/stZnqhkT new file mode 100644 index 00000000..8b277f0d --- /dev/null +++ b/amgprec/stZnqhkT @@ -0,0 +1 @@ +! From abf258e2e8a337870bdb7404975307861867fe69 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 20 Jul 2022 15:45:29 -0500 Subject: [PATCH 69/96] isAlreadyMatched is now atomic --- amgprec/impl/aggregator/clean.cpp | 2 -- amgprec/impl/aggregator/isAlreadyMatched.cpp | 29 ++++++++++++++------ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 29fa351d..62f366b2 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -1,8 +1,6 @@ #include "MatchBoxPC.h" // TODO comment -// TODO use task -// TODO destroy the locks void clean(MilanLongInt NLVer, MilanInt myRank, diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp index d4efd416..a7d65c15 100644 --- a/amgprec/impl/aggregator/isAlreadyMatched.cpp +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -1,6 +1,5 @@ #include "MatchBoxPC.h" -//TODO can be optimized!! /** * //TODO documentation * @param k @@ -13,14 +12,14 @@ * @return */ bool isAlreadyMatched(MilanLongInt node, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - vector &GMate, - MilanLongInt* Mate, - map &Ghost2LocalMap -) { + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) +{ - bool result = false; + /* #pragma omp critical(Mate) { if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? 
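The hunk below replaces the commented-out critical section with plain atomic reads of GMate and Mate. That read is only a cheap pre-filter; callers such as processMatchedVertices still re-check the candidate and perform the actual match inside a critical section, so a stale unmatched value just costs one extra lock acquisition. A sketch of that check-then-confirm shape, with illustrative names:

#include <omp.h>

static void tryMatch(long v, long u, long w, long *Mate, const long *candidateMate)
{
    long mateVal;
#pragma omp atomic read
    mateVal = Mate[v];                 // lock-free snapshot of the mate slot
    if (mateVal >= 0)
        return;                        // already matched, skip the lock entirely
#pragma omp critical
    {
        if (Mate[v] < 0 && candidateMate[v] == u)
            Mate[v] = w;               // decision confirmed while holding the lock
    }
}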
@@ -30,6 +29,18 @@ bool isAlreadyMatched(MilanLongInt node, } } + */ + MilanLongInt val; + if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex + { +#pragma omp atomic read + val = GMate[Ghost2LocalMap[node]]; + return val >= 0; // Already matched + } + + // If not ghost vertex +#pragma omp atomic read + val = Mate[node - StartIndex]; - return result; + return val >= 0; // Already matched } \ No newline at end of file From 9b065602a8573eaf0c0f8f105e5b44ccb4fcc203 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Wed, 20 Jul 2022 16:24:37 -0500 Subject: [PATCH 70/96] Fixed race condition in processExposedVertices --- .../impl/aggregator/processExposedVertex.cpp | 200 ++++++++++-------- 1 file changed, 108 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c28a7c66..c53f2f53 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -32,14 +32,14 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, { MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; - MilanInt ghostOwner = 0; + MilanInt ghostOwner = 0, option; -#pragma omp parallel private(k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) { -#pragma omp for reduction(+ \ - : PCounter[:numProcs]) schedule(static) +#pragma omp for reduction(+ : PCounter[:numProcs]) schedule(static) for (v = 0; v < NLVer; v++) { + option = -1; // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) k = candidateMate[v]; candidateMate[v] = verLocInd[k]; @@ -58,29 +58,75 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if (w >= 0) { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; - } - - if (w >= 0) +#pragma omp critical(processExposed) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) + { #pragma omp atomic - (*myCard)++; + (*myCard)++; + if ((w < StartIndex) || (w > EndIndex)) + { // w is a ghost vertex + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) + { + option = 1; + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else + { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) + { + option = 3; + Mate[v] = w; // v is local + Mate[w - StartIndex] = v + StartIndex; // w is local + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); +#endif + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + + } // End of second if + + } // End critical processExposed + + } // End of 
if(w >=0) + else + { + // This piece of code is executed a really small amount of times + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost vertex + { // A ghost + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message (291):"; - cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif #pragma omp atomic @@ -94,86 +140,56 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQLocalVtx.push_back(v + StartIndex); privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); + privateQMsgType.push_back(FAILURE); privateQOwner.push_back(ghostOwner); - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) - { - - privateU.push_back(v + StartIndex); - privateU.push_back(w); - Mate[v] = w; - // FIXME could this instruction create errors? - GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost - -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; - fflush(stdout); -#endif - - // TODO refactor this!! - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); - } // End of if CandidateMate[w] = v - - } // End of if a Ghost Vertex - else - { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) - { - privateU.push_back(v + StartIndex); - privateU.push_back(w); + } // End of if(GHOST) + } // End of for loop + } + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - Mate[v] = w; // v is local - // FIXME this instruction could create errors - Mate[w - StartIndex] = v + StartIndex; // w is local + switch (option) + { + case -1: + break; + case 1: + privateU.push_back(v + StartIndex); + privateU.push_back(w); #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; + fflush(stdout); #endif - } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) - } // End of Else - - continue; - } // End of second if - - } // End of if(w >=0) - - // This piece of code is executed a really small amount of times, I will not allocate a - // huge amount of memory for the private data structures. 
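Taken together with the critical(processExposed) block above, the new structure decides what happened inside the lock, records it in the option code, and performs the queue pushes and counter updates afterwards in the switch that follows, so the serialized region stays as small as possible. A sketch of that shape, with illustrative names; privateU stands for a thread-private queue as in the patch:

#include <omp.h>
#include <vector>

static void processExposed(long v, long w, long *Mate, std::vector<long> &privateU)
{
    int option = -1;
#pragma omp critical(processExposed)
    {
        if (w >= 0 && Mate[v] < 0) {   // decision taken under the lock
            Mate[v] = w;
            option = 3;                // case 3: matched a local vertex
        }
    }
    switch (option) {                  // side effects outside the lock
    case 3:
        privateU.push_back(v);         // privateU is thread-private, no lock needed
        privateU.push_back(w);
        break;
    default:
        break;
    }
}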
- adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); + case 2: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a request message (291):"; + cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); #endif #pragma omp atomic - (*msgInd)++; + (*msgInd)++; #pragma omp atomic - (*NumMessagesBundled)++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + (*NumMessagesBundled)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + default: + privateU.push_back(v + StartIndex); + privateU.push_back(w); + break; + } + } // End of for ( v=0; v < NLVer; v++ ) queuesTransfer(U, privateU, QLocalVtx, From 9d1a416f9966ef461c4aaccbccbcf01bb5b90539 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Thu, 21 Jul 2022 15:45:31 -0500 Subject: [PATCH 71/96] add rm to exec.sh --- exec.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/exec.sh b/exec.sh index 50edf4ad..1181f776 100755 --- a/exec.sh +++ b/exec.sh @@ -1,4 +1,20 @@ -rm amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o +cd amgprec/impl/aggregator/ +rm MatchBoxPC.o +rm sendBundledMessages.o +rm initialize.o +rm extractUChunk.o +rm isAlreadyMatched.o +rm findOwnerOfGhost.o +rm computeCandidateMate.o +rm parallelComputeCandidateMateB.o +rm processMatchedVertices.o +rm processCrossEdge.o +rm queueTransfer.o +rm processMessages.o +rm processExposedVertex.o +rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o +rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o +cd ../../../ make all cd samples/advanced/pdegen make amg_d_pde3d From e328f3969c69b61d5110daf95ceb4f0f700fe82b Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Fri, 22 Jul 2022 07:25:09 -0500 Subject: [PATCH 72/96] queueTransfer optimization in processMatchedVertices --- amgprec/impl/aggregator/MatchBoxPC.h | 4 +- .../aggregator/processMatchedVertices.cpp | 58 ++++++------------- amgprec/impl/aggregator/processMessages.cpp | 7 +++ 3 files changed, 27 insertions(+), 42 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index dafe381e..cb7d95e2 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -65,8 +65,8 @@ using namespace std; -#define NUM_THREAD 4 -#define UCHUNK 5 +const int NUM_THREAD = 2; +const int UCHUNK = 50; const MilanLongInt REQUEST = 1; const 
MilanLongInt SUCCESS = 2; diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 296bfa15..5e233ce9 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -199,10 +199,11 @@ void processMatchedVertices( Message[1] = w; // GHOST Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) - //#pragma omp master - // { + + printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + fflush(stdout); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -// } + #pragma omp atomic (*msgActual)++; } @@ -254,10 +255,10 @@ void processMatchedVertices( Message[1] = w; // GHOST Message[2] = FAILURE; // TYPE // Send a Request (Asynchronous) - //#pragma omp master - // { + + printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + fflush(stdout); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -// } #pragma omp atomic (*msgActual)++; } @@ -300,10 +301,9 @@ void processMatchedVertices( Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - //#pragma omp master - // { + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + fflush(stdout); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -// } #pragma omp atomic (*msgActual)++; } @@ -327,39 +327,17 @@ void processMatchedVertices( } // End of switch } // End of inner for - - // TODO privateU.size() < UCHUNK could be commented but it generate errors, why? - if (privateU.size() > UCHUNK || U.empty()) - { -#pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } - -#ifndef error -#pragma omp critical(privateMsg) - { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } - } - -#endif - } // End of private.size() } } // End of outer for - } // End of while ( !U.empty() ) - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + } // End of while ( !U.empty() ) #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index bb21396b..474453e3 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -28,6 +28,8 @@ void processMessages( staticQueue &U) { +//#define PRINT_DEBUG_INFO_ + MilanInt Sender; MPI_Status computeStatus; MilanLongInt bundleSize, w; @@ -92,6 +94,8 @@ void processMessages( if (Message[2] == SIZEINFO) { + //printf("Inizio sizeinfo\n"); + fflush(stdout); #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; @@ -124,6 +128,9 @@ void processMessages( cout << endl; fflush(stdout); #endif + + //printf("Fine sizeinfo\n"); + fflush(stdout); } else { // Just a single message: From aa45e2fe936db3d150aedb01409dfe933984cb5e Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 05:14:26 -0500 Subject: [PATCH 73/96] 
processMatchedVerticesAndSendMessages.cpp unoptimized --- amgprec/impl/aggregator/Makefile | 2 +- amgprec/impl/aggregator/MatchBoxPC.h | 46 ++- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 178 +------- .../aggregator/processMatchedVertices.cpp | 78 +--- .../processMatchedVerticesAndSendMessages.cpp | 380 ++++++++++++++++++ amgprec/impl/aggregator/processMessages.cpp | 16 +- 6 files changed, 435 insertions(+), 265 deletions(-) create mode 100644 amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index f1760822..b3b1ac94 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -70,6 +70,7 @@ findOwnerOfGhost.o \ computeCandidateMate.o \ parallelComputeCandidateMateB.o \ processMatchedVertices.o \ +processMatchedVerticesAndSendMessages.o \ processCrossEdge.o \ queueTransfer.o \ processMessages.o \ @@ -77,7 +78,6 @@ processExposedVertex.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o - OBJS = $(FOBJS) $(MPCOBJS) LIBNAME=libamg_prec.a diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index cb7d95e2..8bba9540 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -65,8 +65,8 @@ using namespace std; -const int NUM_THREAD = 2; -const int UCHUNK = 50; +const int NUM_THREAD = 4; +const int UCHUNK = 10; const MilanLongInt REQUEST = 1; const MilanLongInt SUCCESS = 2; @@ -293,6 +293,38 @@ extern "C" MilanLongInt *SPtr); void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner); + + void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, vector &UChunkBeingProcessed, staticQueue &U, @@ -323,11 +355,11 @@ extern "C" staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, staticQueue &privateQOwner, - bool sendMessages = false, - MPI_Comm comm = NULL, - MilanLongInt *msgActual = nullptr, - MilanLongInt *msgInd = nullptr, - vector &Message = DEFAULT_VECTOR); + bool sendMessages, + MPI_Comm comm, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + vector &Message); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, MilanInt *BufferSizePtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 99fd57c3..d8e8bfb7 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -368,11 +368,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( 
/////////////////////////////////////////////////////////////////////////////////// /////////////////////////// PROCESS MATCHED VERTICES ////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - ///* -//#define error -#ifdef error - processMatchedVertices(NLVer, + processMatchedVerticesAndSendMessages(NLVer, UChunkBeingProcessed, U, privateU, @@ -407,179 +404,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( &msgActual, &msgInd, Message); -#endif -#ifndef error - - while (!U.empty()) - { - - u = U.pop_front(); // Get an element from the queue -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); -#endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only If a Local Vertex - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - v = verLocInd[k]; - if ((v >= StartIndex) && (v <= EndIndex)) - { // v is a Local Vertex: - // if (Mate[v - StartIndex] >= 0) // v is already matched - // continue; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); -#endif - // If the current vertex is pointing to a matched vertex and is not matched - if (Mate[v - StartIndex] < 0) - { - if (candidateMate[v - StartIndex] == u) - { // Only if pointing to the matched vertex - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); -#endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - GMate[Ghost2LocalMap[w]] = v; // w is ghost - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], &S); - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - myCard++; -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); -#endif - } // End of if(CandidateMate(w) = v - } 
// End of Else - } // End of if(w >=0) - else - { // no dominating edge found: w == -1 - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u) - } // if (Mate[v - StartIndex] < 0) - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor v is a ghost vertex - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - { // u is a local vertex - // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - // Send a Request (Asynchronous) -#ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs); - fflush(stdout); -#endif - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - msgInd++; - msgActual++; -#ifdef DEBUG_GHOST_ - if ((u < StartIndex) || (u > EndIndex)) - { - cout << "\n(" << myRank << ") " << __LINE__ << " From Send: should not happen: u= " << u << " v= " << v << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; - fflush(stdout); - } -#endif - - } // End of If( v != Mate[u] ) - } // End of Else //A Ghost Vertex - } // End of For Loop adj(u) - } // End of if ( (u >= StartIndex) && (u <= EndIndex) ) //Process Only If a Local Vertex - } // End of while ( !U.empty() ) -#endif ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 5e233ce9..edb1f788 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -1,7 +1,5 @@ #include "MatchBoxPC.h" -//#define error - void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, @@ -32,12 +30,7 @@ void processMatchedVertices( staticQueue &privateQLocalVtx, staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, - staticQueue &privateQOwner, - bool sendMessages, - MPI_Comm comm, - MilanLongInt *msgActual, - MilanLongInt *msgInd, - vector &Message) + staticQueue &privateQOwner) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; @@ -53,7 +46,7 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, 
ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) { while (!U.empty()) @@ -192,29 +185,10 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - - printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - fflush(stdout); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - #pragma omp atomic - (*msgActual)++; - } - else - { + PCounter[ghostOwner]++; #pragma omp atomic - PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; - } - + (*NumMessagesBundledPtr)++; #pragma omp atomic (*msgIndPtr)++; @@ -248,28 +222,10 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - - printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - fflush(stdout); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic - (*msgActual)++; - } - else - { #pragma omp atomic - PCounter[ghostOwner]++; + PCounter[ghostOwner]++; #pragma omp atomic - (*NumMessagesBundledPtr)++; - } - + (*NumMessagesBundledPtr)++; #pragma omp atomic (*msgIndPtr)++; @@ -293,27 +249,11 @@ void processMatchedVertices( ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); assert(ghostOwner != -1); assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - - // Send a Request (Asynchronous) - // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - fflush(stdout); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic - (*msgActual)++; - } - else - { + #pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundledPtr)++; #pragma omp atomic - PCounter[ghostOwner]++; - } + PCounter[ghostOwner]++; #pragma omp atomic (*msgIndPtr)++; diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp new file mode 100644 index 00000000..e61d561f --- /dev/null +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -0,0 +1,380 @@ +#include "MatchBoxPC.h" + +void processMatchedVerticesAndSendMessages( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + staticQueue &U, + staticQueue &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt 
myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + staticQueue &privateQLocalVtx, + staticQueue &privateQGhostVtx, + staticQueue &privateQMsgType, + staticQueue &privateQOwner, + bool sendMessages, + MPI_Comm comm, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + vector &Message) +{ + + MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + int option; + MilanLongInt mateVal; + + vector messagesToSend; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef COUNT_LOCAL_VERTEX + MilanLongInt localVertices = 0; +#endif +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) + { + + while (!U.empty()) + { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) + { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); +#endif + if ((u >= StartIndex) && (u <= EndIndex)) + { // Process Only the Local Vertices + +#ifdef COUNT_LOCAL_VERTEX + localVertices++; +#endif + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) + { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) + { // If Local Vertex: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); +#endif +#pragma omp atomic read + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) + { +#pragma omp critical + { + if (candidateMate[v - StartIndex] == u) + { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); +#endif + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) + { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else + { // w is a local vertex + if (candidateMate[w - StartIndex] == v) + { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + 
option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else + { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + + //printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + //fflush(stdout); +#pragma omp critical(sendMessage) + { + messagesToSend.push_back(v); + messagesToSend.push_back(w); + messagesToSend.push_back(REQUEST); + messagesToSend.push_back(ghostOwner); + } + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + +#pragma omp atomic + (*msgActual)++; + } + else + { +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + (*NumMessagesBundledPtr)++; + } + +#pragma omp atomic + (*msgIndPtr)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); +#pragma omp atomic + (*myCardPtr)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) + { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) + { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + + //printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + //fflush(stdout); +#pragma omp critical(sendMessage) + { + messagesToSend.push_back(v); + messagesToSend.push_back(w); + messagesToSend.push_back(FAILURE); + messagesToSend.push_back(ghostOwner); + } + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +#pragma omp atomic + (*msgActual)++; + } + else + { +#pragma omp atomic + PCounter[ghostOwner]++; +#pragma omp atomic + (*NumMessagesBundledPtr)++; + } + +#pragma omp atomic + (*msgIndPtr)++; + + privateQLocalVtx.push_back(v); + 
privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + assert(ghostOwner != -1); + assert(ghostOwner != myRank); + if (sendMessages) + { + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + + // Send a Request (Asynchronous) + //printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + //fflush(stdout); +#pragma omp critical(sendMessage) + { + messagesToSend.push_back(u); + messagesToSend.push_back(v); + messagesToSend.push_back(SUCCESS); + messagesToSend.push_back(ghostOwner); + } + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +#pragma omp atomic + (*msgActual)++; + } + else + { +#pragma omp atomic + (*NumMessagesBundledPtr)++; +#pragma omp atomic + PCounter[ghostOwner]++; + } + +#pragma omp atomic + (*msgIndPtr)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + } // End of while ( !U.empty() ) + +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); + +#endif + } // End of parallel region + + for (int i = 0; i < messagesToSend.size(); i += 4) + { + Message[0] = messagesToSend[i]; + Message[1] = messagesToSend[i + 1]; + Message[2] = messagesToSend[i + 2]; + ghostOwner = messagesToSend[i + 3]; + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + } +} diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 474453e3..c6cb2531 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -28,7 +28,7 @@ void processMessages( staticQueue &U) { -//#define PRINT_DEBUG_INFO_ + //#define PRINT_DEBUG_INFO_ MilanInt Sender; MPI_Status computeStatus; @@ -94,8 +94,6 @@ void processMessages( if (Message[2] == SIZEINFO) { - //printf("Inizio sizeinfo\n"); - fflush(stdout); #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl; @@ -128,9 +126,6 @@ void processMessages( cout << endl; fflush(stdout); #endif - - //printf("Fine sizeinfo\n"); - fflush(stdout); } else { // Just a single message: @@ -162,8 +157,7 @@ void processMessages( fflush(stdout); #endif - - //Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop + // Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { u = ReceiveBuffer[bundleCounter - 3]; // GHOST @@ -325,10 +319,10 @@ void processMessages( cout << "\n(" << myRank << ")Message type is FAILURE" << endl; fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a 
Dummy Mate to make sure that we do not (u is a ghost) process this anymore + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter - } // End of else: CASE III - } // End of else: CASE I + } // End of else: CASE III + } // End of else: CASE I } return; From 5efee2004175bad45761608e74dd05d55bc2f5ad Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 05:52:27 -0500 Subject: [PATCH 74/96] Optimization, replaced all useless atomic with reduction --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 - amgprec/impl/aggregator/initialize.cpp | 6 +- .../impl/aggregator/processExposedVertex.cpp | 21 +++-- .../aggregator/processMatchedVertices.cpp | 58 +++++++------- .../processMatchedVerticesAndSendMessages.cpp | 78 +++++++++---------- amgprec/impl/aggregator/processMessages.cpp | 8 +- 7 files changed, 81 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 8bba9540..d4b8c04c 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -358,7 +358,6 @@ extern "C" bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, - MilanLongInt *msgInd, vector &Message); void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index d8e8bfb7..7b47c7c9 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -402,7 +402,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( true, comm, &msgActual, - &msgInd, Message); ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 3e7ac207..477f5f6d 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -57,13 +57,13 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, #pragma omp task depend(out \ : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices) { -#pragma omp taskloop num_tasks(NUM_THREAD) +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges[:1]) for (i = 0; i < NLEdge; i++) { // O(m) - Each edge stored twice insertMe = verLocInd[i]; if ((insertMe < StartIndex) || (insertMe > EndIndex)) { // Find a ghost -#pragma omp atomic (*numGhostEdges)++; #pragma omp critical { @@ -76,7 +76,7 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, { // Insert an entry for the ghost: Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry Counter.push_back(1); // Initialize the counter - (*numGhostVertices)++; // Increment the number of ghost vertices + (*numGhostVertices)++; // Increment the number of ghost vertices } // End of else() } } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c53f2f53..91035372 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -34,9 +34,13 @@ void 
PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; MilanInt ghostOwner = 0, option; -#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \ + num_threads(NUM_THREAD) + { -#pragma omp for reduction(+ : PCounter[:numProcs]) schedule(static) +#pragma omp for reduction(+ \ + : PCounter[:numProcs], myCard[:1], msgInd[:1], NumMessagesBundled[:1]) schedule(static) for (v = 0; v < NLVer; v++) { option = -1; @@ -76,7 +80,6 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, if (w >= 0) { -#pragma omp atomic (*myCard)++; if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex @@ -129,13 +132,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); fflush(stdout); #endif -#pragma omp atomic (*msgInd)++; -#pragma omp atomic (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; privateQLocalVtx.push_back(v + StartIndex); @@ -169,13 +170,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; fflush(stdout); #endif -#pragma omp atomic (*msgInd)++; -#pragma omp atomic (*NumMessagesBundled)++; ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; privateQLocalVtx.push_back(v + StartIndex); diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index edb1f788..1e7b2641 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -7,9 +7,9 @@ void processMatchedVertices( staticQueue &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, MilanLongInt *SPtr, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, @@ -46,7 +46,14 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + 
[:1], NumMessagesBundled \ + [:1]) { while (!U.empty()) @@ -171,8 +178,8 @@ void processMatchedVertices( // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -183,14 +190,11 @@ void processMatchedVertices( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundled)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -200,8 +204,8 @@ void processMatchedVertices( case 3: privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + + (*myCard)++; break; case 4: // Could not find a dominating vertex @@ -220,14 +224,12 @@ void processMatchedVertices( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); -#pragma omp atomic + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic - (*msgIndPtr)++; + (*NumMessagesBundled)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -247,16 +249,12 @@ void processMatchedVertices( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic + (*NumMessagesBundled)++; PCounter[ghostOwner]++; - -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index e61d561f..7775d193 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -7,9 +7,9 @@ void processMatchedVerticesAndSendMessages( staticQueue &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, - MilanLongInt *myCardPtr, - MilanLongInt *msgIndPtr, - MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, MilanLongInt *SPtr, MilanLongInt *verLocPtr, MilanLongInt *verLocInd, @@ -34,7 +34,6 @@ void processMatchedVerticesAndSendMessages( bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, - MilanLongInt *msgInd, vector &Message) { @@ -53,7 +52,16 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) num_threads(NUM_THREAD) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, 
privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \ +default(shared) \ +num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ + [:1]) { while (!U.empty()) @@ -178,8 +186,7 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; fflush(stdout); @@ -190,8 +197,8 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -200,8 +207,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = REQUEST; // TYPE // Send a Request (Asynchronous) - //printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(v); @@ -211,19 +218,15 @@ void processMatchedVerticesAndSendMessages( } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundled)++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -233,8 +236,7 @@ void processMatchedVerticesAndSendMessages( case 3: privateU.push_back(v); privateU.push_back(w); -#pragma omp atomic - (*myCardPtr)++; + (*myCard)++; break; case 4: // Could not find a dominating vertex @@ -253,8 +255,8 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -263,8 +265,8 @@ void processMatchedVerticesAndSendMessages( Message[2] = FAILURE; // TYPE // Send a Request (Asynchronous) - //printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(v); @@ -273,19 +275,15 @@ void processMatchedVerticesAndSendMessages( messagesToSend.push_back(ghostOwner); } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic PCounter[ghostOwner]++; -#pragma omp atomic - (*NumMessagesBundledPtr)++; + (*NumMessagesBundled)++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(v); privateQGhostVtx.push_back(w); @@ -305,8 +303,8 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); if (sendMessages) { // Build the Message Packet: @@ -315,8 +313,8 @@ void 
processMatchedVerticesAndSendMessages( Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - //printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - //fflush(stdout); + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); #pragma omp critical(sendMessage) { messagesToSend.push_back(u); @@ -325,19 +323,15 @@ void processMatchedVerticesAndSendMessages( messagesToSend.push_back(ghostOwner); } // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); -#pragma omp atomic (*msgActual)++; } else { -#pragma omp atomic - (*NumMessagesBundledPtr)++; -#pragma omp atomic + (*NumMessagesBundled)++; PCounter[ghostOwner]++; } -#pragma omp atomic - (*msgIndPtr)++; + (*msgInd)++; privateQLocalVtx.push_back(u); privateQGhostVtx.push_back(v); @@ -371,10 +365,10 @@ void processMatchedVerticesAndSendMessages( for (int i = 0; i < messagesToSend.size(); i += 4) { - Message[0] = messagesToSend[i]; - Message[1] = messagesToSend[i + 1]; + Message[0] = messagesToSend[i]; + Message[1] = messagesToSend[i + 1]; Message[2] = messagesToSend[i + 2]; - ghostOwner = messagesToSend[i + 3]; + ghostOwner = messagesToSend[i + 3]; MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); } } diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index c6cb2531..4150a330 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -243,8 +243,8 @@ void processMessages( fflush(stdout); #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgInd)++; @@ -301,8 +301,8 @@ void processMessages( fflush(stdout); #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - assert(ghostOwner != -1); - assert(ghostOwner != myRank); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgInd)++; (*msgActual)++; From 1ab166b38b975f2b561fb4e592ec627c58effbbc Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 08:24:50 -0500 Subject: [PATCH 75/96] Improved performance of processMatchedVerticesAndSendMessages.cpp --- amgprec/impl/aggregator/MatchBoxPC.h | 1 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 1 - .../processMatchedVerticesAndSendMessages.cpp | 153 ++++++++---------- 3 files changed, 63 insertions(+), 92 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index d4b8c04c..8fcc495b 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -355,7 +355,6 @@ extern "C" staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, staticQueue &privateQOwner, - bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, vector &Message); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 7b47c7c9..612ac95f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -399,7 +399,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( 
privateQGhostVtx, privateQMsgType, privateQOwner, - true, comm, &msgActual, Message); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 7775d193..49235870 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -31,7 +31,6 @@ void processMatchedVerticesAndSendMessages( staticQueue &privateQGhostVtx, staticQueue &privateQMsgType, staticQueue &privateQOwner, - bool sendMessages, MPI_Comm comm, MilanLongInt *msgActual, vector &Message) @@ -41,7 +40,7 @@ void processMatchedVerticesAndSendMessages( int option; MilanLongInt mateVal; - vector messagesToSend; + vector privatemessagesToSend, messagesToSend; #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -52,16 +51,15 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ - firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \ -default(shared) \ -num_threads(NUM_THREAD) \ - reduction(+ \ - : msgInd[:1], PCounter \ - [:numProcs], myCard \ - [:1], NumMessagesBundled \ - [:1], msgActual \ - [:1]) +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option, privatemessagesToSend) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ + [:1]) { while (!U.empty()) @@ -199,33 +197,22 @@ num_threads(NUM_THREAD) ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); // assert(ghostOwner != -1); // assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); -#pragma omp critical(sendMessage) - { - messagesToSend.push_back(v); - messagesToSend.push_back(w); - messagesToSend.push_back(REQUEST); - messagesToSend.push_back(ghostOwner); - } - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - } - else - { - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - } + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + + // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); + privatemessagesToSend.push_back(v); + privatemessagesToSend.push_back(w); + privatemessagesToSend.push_back(REQUEST); + privatemessagesToSend.push_back(ghostOwner); + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; (*msgInd)++; privateQLocalVtx.push_back(v); @@ -257,32 +244,22 @@ num_threads(NUM_THREAD) ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); // assert(ghostOwner != -1); // assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = v; // LOCAL - 
Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); -#pragma omp critical(sendMessage) - { - messagesToSend.push_back(v); - messagesToSend.push_back(w); - messagesToSend.push_back(FAILURE); - messagesToSend.push_back(ghostOwner); - } - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgActual)++; - } - else - { - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - } + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + + // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout); + privatemessagesToSend.push_back(v); + privatemessagesToSend.push_back(w); + privatemessagesToSend.push_back(FAILURE); + privatemessagesToSend.push_back(ghostOwner); + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; (*msgInd)++; privateQLocalVtx.push_back(v); @@ -305,32 +282,22 @@ num_threads(NUM_THREAD) ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); // assert(ghostOwner != -1); // assert(ghostOwner != myRank); - if (sendMessages) - { - // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - - // Send a Request (Asynchronous) - // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); -#pragma omp critical(sendMessage) - { - messagesToSend.push_back(u); - messagesToSend.push_back(v); - messagesToSend.push_back(SUCCESS); - messagesToSend.push_back(ghostOwner); - } - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgActual)++; - } - else - { - (*NumMessagesBundled)++; - PCounter[ghostOwner]++; - } + // Build the Message Packet: + Message[0] = u; // LOCAL + Message[1] = v; // GHOST + Message[2] = SUCCESS; // TYPE + + // Send a Request (Asynchronous) + // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); + // fflush(stdout);) + privatemessagesToSend.push_back(u); + privatemessagesToSend.push_back(v); + privatemessagesToSend.push_back(SUCCESS); + privatemessagesToSend.push_back(ghostOwner); + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; (*msgInd)++; privateQLocalVtx.push_back(u); @@ -340,11 +307,17 @@ num_threads(NUM_THREAD) break; } // End of switch - - } // End of inner for + } // End of inner for } } // End of outer for +#pragma omp critical(sendMessageTransfer) + { + messagesToSend.insert(messagesToSend.end(), privatemessagesToSend.begin(), privatemessagesToSend.end()); + + privatemessagesToSend.clear(); + } + queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner, privateQLocalVtx, From 066c1a5e62de13c322d1101834fa4a492e7af72b Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 09:27:35 -0500 Subject: [PATCH 76/96] optimization processMatchedVerticesAndSendMessages.cpp --- amgprec/impl/aggregator/MatchBoxPC.h | 4 - ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 4 - .../processMatchedVerticesAndSendMessages.cpp | 100 ++++++++---------- 3 files changed, 42 insertions(+), 66 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 8fcc495b..01cc0589 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -351,10 +351,6 @@ 
extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 612ac95f..c1210ea7 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -395,10 +395,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( QGhostVtx, QMsgType, QOwner, - privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner, comm, &msgActual, Message); diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 49235870..9d4077a7 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -27,20 +27,22 @@ void processMatchedVerticesAndSendMessages( vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message) { + MilanLongInt initialSize = QLocalVtx.size(); MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; int option; MilanLongInt mateVal; - vector privatemessagesToSend, messagesToSend; + // TODO reserve!!! + vector privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; + privateQLocalVtx.reserve(100000); + privateQGhostVtx.reserve(100000); + privateQMsgType.reserve(100000); + privateQOwner.reserve(100000); #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; @@ -51,7 +53,7 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option, privatemessagesToSend) \ +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ num_threads(NUM_THREAD) \ reduction(+ \ @@ -195,21 +197,12 @@ void processMatchedVerticesAndSendMessages( // Found a dominating edge, it is a ghost ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 2: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); - privatemessagesToSend.push_back(v); - privatemessagesToSend.push_back(w); - privatemessagesToSend.push_back(REQUEST); - privatemessagesToSend.push_back(ghostOwner); + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgActual)++; @@ -242,21 +235,12 @@ void 
processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - - // printf("Send case 4: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout); - privatemessagesToSend.push_back(v); - privatemessagesToSend.push_back(w); - privatemessagesToSend.push_back(FAILURE); - privatemessagesToSend.push_back(ghostOwner); + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgActual)++; @@ -280,21 +264,12 @@ void processMatchedVerticesAndSendMessages( #endif ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); // Build the Message Packet: - Message[0] = u; // LOCAL - Message[1] = v; // GHOST - Message[2] = SUCCESS; // TYPE - + // Message[0] = u; // LOCAL + // Message[1] = v; // GHOST + // Message[2] = SUCCESS; // TYPE // Send a Request (Asynchronous) - // printf("Send case 5: (%ld, %ld, %ld)\n", Message[0], Message[1], Message[2]); - // fflush(stdout);) - privatemessagesToSend.push_back(u); - privatemessagesToSend.push_back(v); - privatemessagesToSend.push_back(SUCCESS); - privatemessagesToSend.push_back(ghostOwner); // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); (*msgActual)++; @@ -311,19 +286,25 @@ void processMatchedVerticesAndSendMessages( } } // End of outer for +#pragma omp critical(U) + { + while (!privateU.empty()) + U.push_back(privateU.pop_back()); + } + #pragma omp critical(sendMessageTransfer) { - messagesToSend.insert(messagesToSend.end(), privatemessagesToSend.begin(), privatemessagesToSend.end()); - privatemessagesToSend.clear(); - } + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + } } // End of while ( !U.empty() ) @@ -336,12 +317,15 @@ void processMatchedVerticesAndSendMessages( #endif } // End of parallel region - for (int i = 0; i < messagesToSend.size(); i += 4) + //Send the messages + for (int i = initialSize; i < QOwner.size(); i++) { - Message[0] = messagesToSend[i]; - Message[1] = messagesToSend[i + 1]; - Message[2] = messagesToSend[i + 2]; - ghostOwner = messagesToSend[i + 3]; + + Message[0] = QLocalVtx[i]; + Message[1] = QGhostVtx[i]; + Message[2] = QMsgType[i]; + ghostOwner = QOwner[i]; + MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); } } From 500403dbdac33e4a9af4a6a125bbbbc561d79e49 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 11:13:21 -0500 Subject: [PATCH 77/96] Replaced some staticQueues with vectors for performance reasons --- amgprec/impl/aggregator/MatchBoxPC.h | 36 +++++----- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 65 ++++++++++--------- 
amgprec/impl/aggregator/initialize.cpp | 17 ++--- .../impl/aggregator/processExposedVertex.cpp | 15 +++-- .../aggregator/processMatchedVertices.cpp | 8 +-- .../processMatchedVerticesAndSendMessages.cpp | 40 ++++-------- amgprec/impl/aggregator/queueTransfer.cpp | 38 ++++++----- 7 files changed, 110 insertions(+), 109 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 01cc0589..1066f8ef 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -189,10 +189,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); bool isAlreadyMatched(MilanLongInt node, MilanLongInt StartIndex, @@ -233,10 +233,10 @@ extern "C" MilanLongInt *&candidateMate, staticQueue &U, staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); void clean(MilanLongInt NLVer, MilanInt myRank, @@ -284,10 +284,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); void PROCESS_CROSS_EDGE(MilanLongInt *edge, MilanLongInt *SPtr); @@ -319,10 +319,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner); + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, @@ -351,6 +351,10 @@ extern "C" vector &QGhostVtx, vector &QMsgType, vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message); diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index c1210ea7..4297391a 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -182,7 +182,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector GMate; // Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; - staticQueue U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; + staticQueue U, privateU; vector PCumulative, PMessageBundle, PSizeInfoMessages; vector SRequest; // Requests that are used for each send message vector SStatus; // Status of sent messages, used in MPI_Wait @@ -190,6 +190,9 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanInt BufferSize; MilanLongInt *Buffer; + vector privateQLocalVtx, privateQGhostVtx, privateQMsgType; + vector privateQOwner; + initialize(NLVer, NLEdge, StartIndex, EndIndex, &numGhostEdges, &numGhostVertices, &S, 
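// --- Illustrative sketch, not part of this patch ---------------------------------
// processMatchedVerticesAndSendMessages defers all MPI traffic generated inside the
// OpenMP region: threads only append (local vertex, ghost vertex, message type,
// owner) tuples to the shared queues, and the calling rank drains whatever was
// appended after entry with MPI_Bsend once the region has ended. The helper below
// shows that drain step in isolation; the function name, the plain long/int element
// types, MPI_LONG, and the tag parameter are simplifications, not code from this
// repository.
#include <mpi.h>
#include <vector>

static void flushQueuedMessages(const std::vector<long> &QLocalVtx,
                                const std::vector<long> &QGhostVtx,
                                const std::vector<long> &QMsgType,
                                const std::vector<int> &QOwner,
                                std::size_t initialSize, // queue size on entry
                                int computeTag, MPI_Comm comm)
{
    long Message[3];
    for (std::size_t i = initialSize; i < QOwner.size(); i++) {
        Message[0] = QLocalVtx[i]; // LOCAL
        Message[1] = QGhostVtx[i]; // GHOST
        Message[2] = QMsgType[i];  // REQUEST / SUCCESS / FAILURE
        // Buffered send, one small packet per queued message, as in the patch.
        MPI_Bsend(&Message[0], 3, MPI_LONG, QOwner[i], computeTag, comm);
    }
}
// ----------------------------------------------------------------------------------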
@@ -370,34 +373,38 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( /////////////////////////////////////////////////////////////////////////////////// processMatchedVerticesAndSendMessages(NLVer, - UChunkBeingProcessed, - U, - privateU, - StartIndex, - EndIndex, - &myCard, - &msgInd, - &NumMessagesBundled, - &S, - verLocPtr, - verLocInd, - verDistance, - PCounter, - Counter, - myRank, - numProcs, - candidateMate, - GMate, - Mate, - Ghost2LocalMap, - edgeLocWeight, - QLocalVtx, - QGhostVtx, - QMsgType, - QOwner, - comm, - &msgActual, - Message); + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner, + comm, + &msgActual, + Message); ///////////////////////// END OF PROCESS MATCHED VERTICES ///////////////////////// diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 477f5f6d..47f424fd 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -21,10 +21,10 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *&candidateMate, staticQueue &U, staticQueue &privateU, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { MilanLongInt insertMe = 0; @@ -295,10 +295,11 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, // Initialize the privte data structure new (&privateU) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I put a meaningfull size? 
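// --- Illustrative sketch, not part of this patch ---------------------------------
// The staticQueue placement-new calls below are being replaced by std::vector with
// an up-front reserve(): capacity is claimed once (here *numGhostVertices serves as
// the estimate), so the push_back calls issued while the matching runs do not have
// to reallocate. The helper name and element type are illustrative only.
#include <vector>

static void preallocatePrivateQueue(std::vector<long> &queue, std::size_t expectedEntries)
{
    queue.reserve(expectedEntries); // single allocation; size() remains 0
    // Subsequent queue.push_back(x) calls stay cheap until size() exceeds
    // expectedEntries, at which point the vector grows as usual.
}
// ----------------------------------------------------------------------------------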
- new (&privateQLocalVtx) staticQueue(size); - new (&privateQGhostVtx) staticQueue(size); - new (&privateQMsgType) staticQueue(size); - new (&privateQOwner) staticQueue(size); + + privateQLocalVtx.reserve(*numGhostVertices); + privateQGhostVtx.reserve(*numGhostVertices); + privateQMsgType.reserve(*numGhostVertices); + privateQOwner.reserve(*numGhostVertices); } // end of task } // End of single region diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index 91035372..c330e724 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -25,10 +25,10 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; @@ -39,8 +39,11 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, num_threads(NUM_THREAD) { -#pragma omp for reduction(+ \ - : PCounter[:numProcs], myCard[:1], msgInd[:1], NumMessagesBundled[:1]) schedule(static) +#pragma omp for reduction(+ \ + : PCounter[:numProcs], myCard \ + [:1], msgInd \ + [:1], NumMessagesBundled \ + [:1]) schedule(static) for (v = 0; v < NLVer; v++) { option = -1; diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 1e7b2641..510c9877 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -27,10 +27,10 @@ void processMatchedVertices( vector &QGhostVtx, vector &QMsgType, vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 9d4077a7..debfc5ca 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -27,6 +27,10 @@ void processMatchedVerticesAndSendMessages( vector &QGhostVtx, vector &QMsgType, vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, MPI_Comm comm, MilanLongInt *msgActual, vector &Message) @@ -37,13 +41,6 @@ void processMatchedVerticesAndSendMessages( int option; MilanLongInt mateVal; - // TODO reserve!!! 
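// --- Illustrative sketch, not part of this patch ---------------------------------
// The reduction clauses used above rely on OpenMP array sections: listing a pointer
// as ptr[:1] gives every thread a private copy of the pointee, initialized to zero
// and summed back into the original after the region, which is why the explicit
// "#pragma omp atomic" increments could be dropped. A minimal standalone example of
// the same idiom (names and types are hypothetical, compile with -fopenmp):
static void countEvens(const long *values, long n, long *evens)
{
#pragma omp parallel for reduction(+ : evens[:1])
    for (long i = 0; i < n; i++)
        if (values[i] % 2 == 0)
            (*evens)++; // no atomic needed: each thread increments its private copy
}
// Counters that are genuine arrays, such as PCounter, are reduced the same way with
// reduction(+ : PCounter[:numProcs]), i.e. one private copy of the whole array per
// thread.
// ----------------------------------------------------------------------------------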
- vector privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner; - privateQLocalVtx.reserve(100000); - privateQGhostVtx.reserve(100000); - privateQMsgType.reserve(100000); - privateQOwner.reserve(100000); - #ifdef PRINT_DEBUG_INFO_ cout << "\n(" << myRank << "=========================************===============================" << endl; fflush(stdout); @@ -53,7 +50,7 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ +#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ num_threads(NUM_THREAD) \ reduction(+ \ @@ -286,25 +283,12 @@ void processMatchedVerticesAndSendMessages( } } // End of outer for -#pragma omp critical(U) - { - while (!privateU.empty()) - U.push_back(privateU.pop_back()); - } - -#pragma omp critical(sendMessageTransfer) - { - - QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); - QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); - QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); - QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); - - privateQLocalVtx.clear(); - privateQGhostVtx.clear(); - privateQMsgType.clear(); - privateQOwner.clear(); - } + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); } // End of while ( !U.empty() ) @@ -317,7 +301,7 @@ void processMatchedVerticesAndSendMessages( #endif } // End of parallel region - //Send the messages + // Send the messages for (int i = initialSize; i < QOwner.size(); i++) { diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index cbae1fc2..0439a08c 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -1,32 +1,34 @@ #include "MatchBoxPC.h" void queuesTransfer(staticQueue &U, - staticQueue &privateU, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - staticQueue &privateQLocalVtx, - staticQueue &privateQGhostVtx, - staticQueue &privateQMsgType, - staticQueue &privateQOwner) + staticQueue &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) { - #pragma omp critical(U) { while (!privateU.empty()) U.push_back(privateU.pop_back()); } -#pragma omp critical(privateMsg) +#pragma omp critical(sendMessageTransfer) { - while (!privateQLocalVtx.empty()) - { - QLocalVtx.push_back(privateQLocalVtx.pop_back()); - QGhostVtx.push_back(privateQGhostVtx.pop_back()); - QMsgType.push_back(privateQMsgType.pop_back()); - QOwner.push_back(privateQOwner.pop_back()); - } + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); } 
\ No newline at end of file From a259e8ab53cb000416233940207972badafd7daa Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 11:34:43 -0500 Subject: [PATCH 78/96] extractUChunch optimization --- amgprec/impl/aggregator/extractUChunk.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp index b5bc1f5f..e26d1011 100644 --- a/amgprec/impl/aggregator/extractUChunk.cpp +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -11,14 +11,18 @@ void extractUChunk( { if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U + { while (!privateU.empty()) - U.push_back(privateU.pop_front()); - - for (int i = 0; i < UCHUNK; i++) - { // Pop the new nodes - if (U.empty()) - break; - UChunkBeingProcessed.push_back(U.pop_front()); + UChunkBeingProcessed.push_back(privateU.pop_back()); + } + else + { + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; + UChunkBeingProcessed.push_back(U.pop_back()); + } } } // End of critical U From 6414d3aef32c0818babf5d3dc139c66ac5918328 Mon Sep 17 00:00:00 2001 From: StefanoPetrilli Date: Sat, 23 Jul 2022 12:47:43 -0500 Subject: [PATCH 79/96] U and privateU are now vectors --- amgprec/impl/aggregator/MatchBoxPC.h | 26 +++++++++---------- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 3 +-- amgprec/impl/aggregator/extractUChunk.cpp | 12 +++++---- amgprec/impl/aggregator/initialize.cpp | 15 +++++------ .../impl/aggregator/processExposedVertex.cpp | 4 +-- .../aggregator/processMatchedVertices.cpp | 25 ++++++++++++++++-- .../processMatchedVerticesAndSendMessages.cpp | 4 +-- amgprec/impl/aggregator/processMessages.cpp | 2 +- amgprec/impl/aggregator/queueTransfer.cpp | 9 ++++--- 9 files changed, 60 insertions(+), 40 deletions(-) diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 1066f8ef..a1fddb59 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -183,8 +183,8 @@ extern "C" MilanLongInt *verLocInd, MilanReal *edgeLocWeight); - void queuesTransfer(staticQueue &U, - staticQueue &privateU, + void queuesTransfer(vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, @@ -231,8 +231,8 @@ extern "C" vector &QMsgType, vector &QOwner, MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &privateQLocalVtx, vector &privateQGhostVtx, vector &privateQMsgType, @@ -278,8 +278,8 @@ extern "C" vector &Counter, MilanInt myRank, MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, @@ -295,8 +295,8 @@ extern "C" void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCardPtr, @@ -327,8 +327,8 @@ extern "C" void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCardPtr, @@ -404,12 +404,12 @@ extern "C" MilanLongInt u, MilanLongInt v, MilanLongInt *SPtr, - staticQueue &U); + vector &U); void extractUChunk( vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU); + vector &U, + 
vector &privateU); void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( MilanLongInt NLVer, MilanLongInt NLEdge, diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index 4297391a..bb2dd5a7 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -182,7 +182,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector GMate; // Proportional to the number of ghost vertices MilanLongInt S; MilanLongInt privateMyCard = 0; - staticQueue U, privateU; vector PCumulative, PMessageBundle, PSizeInfoMessages; vector SRequest; // Requests that are used for each send message vector SStatus; // Status of sent messages, used in MPI_Wait @@ -192,6 +191,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( vector privateQLocalVtx, privateQGhostVtx, privateQMsgType; vector privateQOwner; + vector U, privateU; initialize(NLVer, NLEdge, StartIndex, EndIndex, &numGhostEdges, @@ -240,7 +240,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( * TODO: Test when it's actually more efficient to execute this code * in parallel. */ - PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer, candidateMate, verLocInd, diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp index e26d1011..923a0b51 100644 --- a/amgprec/impl/aggregator/extractUChunk.cpp +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -2,8 +2,8 @@ void extractUChunk( vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU) + vector &U, + vector &privateU) { UChunkBeingProcessed.clear(); @@ -13,7 +13,8 @@ void extractUChunk( if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U { while (!privateU.empty()) - UChunkBeingProcessed.push_back(privateU.pop_back()); + UChunkBeingProcessed.push_back(privateU.back()); + privateU.pop_back(); } else { @@ -21,9 +22,10 @@ void extractUChunk( { // Pop the new nodes if (U.empty()) break; - UChunkBeingProcessed.push_back(U.pop_back()); + UChunkBeingProcessed.push_back(U.back()); + U.pop_back(); } } - } // End of critical U + } // End of critical U // End of critical U } \ No newline at end of file diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp index 47f424fd..17a4169e 100644 --- a/amgprec/impl/aggregator/initialize.cpp +++ b/amgprec/impl/aggregator/initialize.cpp @@ -19,8 +19,8 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, vector &QMsgType, vector &QOwner, MilanLongInt *&candidateMate, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &privateQLocalVtx, vector &privateQGhostVtx, vector &privateQMsgType, @@ -288,18 +288,15 @@ void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, * of a staticQueue I had to destroy the previous object and instantiate * a new one of the correct size. */ - new (&U) staticQueue(NLVer + (*numGhostVertices)); + //new (&U) staticQueue(NLVer + (*numGhostVertices)); + U.reserve(NLVer + (*numGhostVertices)); - // TODO how can I decide a more meaningfull size? - MilanLongInt size = (*numGhostVertices); - - // Initialize the privte data structure - new (&privateU) staticQueue(NLVer + (*numGhostVertices)); // TODO how can I put a meaningfull size? 
- + // Initialize the private vectors privateQLocalVtx.reserve(*numGhostVertices); privateQGhostVtx.reserve(*numGhostVertices); privateQMsgType.reserve(*numGhostVertices); privateQOwner.reserve(*numGhostVertices); + privateU.reserve(*numGhostVertices); } // end of task } // End of single region diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp index c330e724..49227158 100644 --- a/amgprec/impl/aggregator/processExposedVertex.cpp +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -19,8 +19,8 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, vector &Counter, MilanInt myRank, MilanInt numProcs, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index 510c9877..e96dcc1d 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -3,8 +3,8 @@ void processMatchedVertices( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCard, @@ -275,6 +275,27 @@ void processMatchedVertices( privateQMsgType, privateQOwner); +#pragma omp critical(U) + { + U.insert(U.end(), privateU.begin(), privateU.end()); + } + + privateU.clear(); + +#pragma omp critical(sendMessageTransfer) + { + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); + } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + } // End of while ( !U.empty() ) #ifdef COUNT_LOCAL_VERTEX diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index debfc5ca..3322a05b 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -3,8 +3,8 @@ void processMatchedVerticesAndSendMessages( MilanLongInt NLVer, vector &UChunkBeingProcessed, - staticQueue &U, - staticQueue &privateU, + vector &U, + vector &privateU, MilanLongInt StartIndex, MilanLongInt EndIndex, MilanLongInt *myCard, diff --git a/amgprec/impl/aggregator/processMessages.cpp b/amgprec/impl/aggregator/processMessages.cpp index 4150a330..804790c6 100644 --- a/amgprec/impl/aggregator/processMessages.cpp +++ b/amgprec/impl/aggregator/processMessages.cpp @@ -25,7 +25,7 @@ void processMessages( MilanLongInt u, MilanLongInt v, MilanLongInt *S, - staticQueue &U) + vector &U) { //#define PRINT_DEBUG_INFO_ diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index 0439a08c..7200b43d 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -1,7 +1,7 @@ #include "MatchBoxPC.h" -void queuesTransfer(staticQueue &U, - staticQueue &privateU, +void queuesTransfer(vector &U, + vector &privateU, vector &QLocalVtx, vector &QGhostVtx, vector &QMsgType, @@ -14,10 +14,11 @@ void queuesTransfer(staticQueue &U, #pragma omp critical(U) { - 
while (!privateU.empty()) - U.push_back(privateU.pop_back()); + U.insert(U.end(), privateU.begin(), privateU.end()); } + privateU.clear(); + #pragma omp critical(sendMessageTransfer) { From d59c9e6c0a56e0399b4884d2ca11c5d3ebe5556c Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 22 Nov 2022 03:02:51 -0500 Subject: [PATCH 80/96] Updates towards OpenMP version. --- amgprec/amg_d_matchboxp_mod.f90 | 2 +- amgprec/impl/aggregator/Makefile | 1 + amgprec/impl/aggregator/clean.cpp | 4 ++-- samples/advanced/pdegen/Makefile | 2 +- samples/advanced/pdegen/runs/amg_pde3d.inp | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index a18d62d6..f6cb38ff 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -1109,7 +1109,7 @@ contains verlocptr(:) = verlocptr(:) + 1 verlocind(:) = verlocind(:) + 1 verdistance(:) = verdistance(:) + 1 - + if (me==0) write(0,*) 'Ph0/1/2 time ',ph0_time, ph1_time, ph2_time if (debug_sync) then call psb_barrier(ictxt) if (me == 0) write(0,*)' Done MatchBoxP ' diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 00e43088..11027ac1 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -67,6 +67,7 @@ initialize.o \ extractUChunk.o \ isAlreadyMatched.o \ findOwnerOfGhost.o \ +clean.o \ computeCandidateMate.o \ parallelComputeCandidateMateB.o \ processMatchedVertices.o \ diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp index 62f366b2..f316aee7 100644 --- a/amgprec/impl/aggregator/clean.cpp +++ b/amgprec/impl/aggregator/clean.cpp @@ -33,7 +33,7 @@ void clean(MilanLongInt NLVer, cout << "\n(" << myRank << ") Waitall " << endl; fflush(stdout); #endif - return; + //return; MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); @@ -88,4 +88,4 @@ void clean(MilanLongInt NLVer, #endif } } -} \ No newline at end of file +} diff --git a/samples/advanced/pdegen/Makefile b/samples/advanced/pdegen/Makefile index 8a49c73a..b5092a22 100644 --- a/samples/advanced/pdegen/Makefile +++ b/samples/advanced/pdegen/Makefile @@ -3,7 +3,7 @@ AMGINCDIR=$(AMGDIR)/include include $(AMGINCDIR)/Make.inc.amg4psblas AMGMODDIR=$(AMGDIR)/modules AMGLIBDIR=$(AMGDIR)/lib -AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec -llapack -lblas +AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec FINCLUDES=$(FMFLAG). $(FMFLAG)$(AMGMODDIR) $(FMFLAG)$(AMGINCDIR) $(PSBLAS_INCLUDES) $(FIFLAG). LINKOPT= diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index b6c448c3..0cd5d6c5 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0020 ! IDIM; domain size. Linear system size is IDIM**3 +0200 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! 
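Besides the build and input tweaks, the clean.cpp hunk above comments out the early return; so the MPI_Waitall on the outstanding send requests actually runs before the routine exits. The sketch below illustrates only that ordering in a stand-alone program (ring exchange, placeholder payload and tag); it does not reproduce the SRequest/SStatus bookkeeping of clean() itself, and the buffer sizing is an assumption of this example.

// Editorial sketch: complete pending requests and detach the Bsend buffer
// before tearing MPI down; returning early would leave them dangling.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Attach a buffer large enough for one 3-integer packet per destination.
    int packet = 3 * sizeof(long);
    std::vector<char> sendBuf(nprocs * (packet + MPI_BSEND_OVERHEAD));
    MPI_Buffer_attach(sendBuf.data(), (int)sendBuf.size());

    long msg[3] = {rank, (rank + 1) % nprocs, 42};   // placeholder payload
    long in[3];
    MPI_Request req;
    MPI_Irecv(in, 3, MPI_LONG, (rank + nprocs - 1) % nprocs, 0,
              MPI_COMM_WORLD, &req);
    MPI_Bsend(msg, 3, MPI_LONG, (rank + 1) % nprocs, 0, MPI_COMM_WORLD);

    // Completing the request before detaching the buffer is the point:
    // skipping this (as the removed "return;" did) leaves the buffered send
    // and the posted receive unfinished.
    MPI_Status st;
    MPI_Waitall(1, &req, &st);

    void *detached;
    int detachedSize;
    MPI_Buffer_detach(&detached, &detachedSize);  // blocks until Bsends drain

    std::printf("rank %d received %ld %ld %ld\n", rank, in[0], in[1], in[2]);
    MPI_Finalize();
    return 0;
}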
ISTOPC From 32994c7ce82dbdae0fd908b6f87bebf15b9f8a2e Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 13 Dec 2022 06:44:12 -0500 Subject: [PATCH 81/96] Better parameters in matchboxp_mod --- amgprec/amg_d_matchboxp_mod.f90 | 17 +++++++++-------- amgprec/amg_s_matchboxp_mod.f90 | 15 ++++++++------- amgprec/stZnqhkT | 1 - 3 files changed, 17 insertions(+), 16 deletions(-) delete mode 100644 amgprec/stZnqhkT diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index f6cb38ff..2df58797 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -146,6 +146,7 @@ contains & debug_ilaggr=.false., debug_sync=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. + integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 ictxt = desc_a%get_ctxt() call psb_info(ictxt,iam,np) @@ -187,7 +188,7 @@ contains call desc_a%l2gip(ilv,info,owned=.false.) call psb_geall(ilaggr,desc_a,info) - ilaggr = -1 + ilaggr = ilaggr_neginit call psb_geasb(ilaggr,desc_a,info) nr = a%get_nrows() nc = a%get_ncols() @@ -259,7 +260,7 @@ contains cycle else - if (ilaggr(k) == -1) then + if (ilaggr(k) == ilaggr_neginit) then wk = w(k) widx = w(idx) @@ -267,7 +268,7 @@ contains nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) if (nrmagg > epsilon(nrmagg)) then if (idx <= nr) then - if (ilaggr(idx) == -1) then + if (ilaggr(idx) == ilaggr_neginit) then ! Now, if both vertices are local, the aggregate is local ! (kinda obvious). nlaggr(iam) = nlaggr(iam) + 1 @@ -294,7 +295,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if else ! Use a statistically unbiased tie-breaking rule, @@ -309,7 +310,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if end if end if @@ -332,7 +333,7 @@ contains if (do_timings) call psb_tic(idx_phase3) ! Ok, now compute offsets, gather halo and fix non-local - ! aggregates (those where ilaggr == -2) + ! aggregates (those where ilaggr == ilaggr_nonlocal) call psb_sum(ictxt,nlaggr) ntaggr = sum(nlaggr(0:np-1)) naggrm1 = sum(nlaggr(0:iam-1)) @@ -347,7 +348,7 @@ contains call psb_halo(wtemp,desc_a,info) ! Cleanup as yet unmarked entries do k=1,nr - if (ilaggr(k) == -2) then + if (ilaggr(k) == ilaggr_nonlocal) then idx = mate(k) if (idx > nr) then i = ilaggr(idx) @@ -1109,7 +1110,7 @@ contains verlocptr(:) = verlocptr(:) + 1 verlocind(:) = verlocind(:) + 1 verdistance(:) = verdistance(:) + 1 - if (me==0) write(0,*) 'Ph0/1/2 time ',ph0_time, ph1_time, ph2_time + if (debug_sync) then call psb_barrier(ictxt) if (me == 0) write(0,*)' Done MatchBoxP ' diff --git a/amgprec/amg_s_matchboxp_mod.f90 b/amgprec/amg_s_matchboxp_mod.f90 index 9061344f..9144d70e 100644 --- a/amgprec/amg_s_matchboxp_mod.f90 +++ b/amgprec/amg_s_matchboxp_mod.f90 @@ -146,6 +146,7 @@ contains & debug_ilaggr=.false., debug_sync=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. + integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 ictxt = desc_a%get_ctxt() call psb_info(ictxt,iam,np) @@ -187,7 +188,7 @@ contains call desc_a%l2gip(ilv,info,owned=.false.) 
call psb_geall(ilaggr,desc_a,info) - ilaggr = -1 + ilaggr = ilaggr_neginit call psb_geasb(ilaggr,desc_a,info) nr = a%get_nrows() nc = a%get_ncols() @@ -259,7 +260,7 @@ contains cycle else - if (ilaggr(k) == -1) then + if (ilaggr(k) == ilaggr_neginit) then wk = w(k) widx = w(idx) @@ -267,7 +268,7 @@ contains nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) if (nrmagg > epsilon(nrmagg)) then if (idx <= nr) then - if (ilaggr(idx) == -1) then + if (ilaggr(idx) == ilaggr_neginit) then ! Now, if both vertices are local, the aggregate is local ! (kinda obvious). nlaggr(iam) = nlaggr(iam) + 1 @@ -294,7 +295,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if else ! Use a statistically unbiased tie-breaking rule, @@ -309,7 +310,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if end if end if @@ -332,7 +333,7 @@ contains if (do_timings) call psb_tic(idx_phase3) ! Ok, now compute offsets, gather halo and fix non-local - ! aggregates (those where ilaggr == -2) + ! aggregates (those where ilaggr == ilaggr_nonlocal) call psb_sum(ictxt,nlaggr) ntaggr = sum(nlaggr(0:np-1)) naggrm1 = sum(nlaggr(0:iam-1)) @@ -347,7 +348,7 @@ contains call psb_halo(wtemp,desc_a,info) ! Cleanup as yet unmarked entries do k=1,nr - if (ilaggr(k) == -2) then + if (ilaggr(k) == ilaggr_nonlocal) then idx = mate(k) if (idx > nr) then i = ilaggr(idx) diff --git a/amgprec/stZnqhkT b/amgprec/stZnqhkT deleted file mode 100644 index 8b277f0d..00000000 --- a/amgprec/stZnqhkT +++ /dev/null @@ -1 +0,0 @@ -! From ebe9b451775dd0df0ac0d9c0ffa28db64e87da87 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 07:50:58 -0500 Subject: [PATCH 82/96] Modify MATCHBOXP to fix OpenMP. 
Performance to be reviewed --- ...mEdgesLinearSearchMesgBndlSmallMateCMP.cpp | 175 ++++-- .../impl/aggregator/computeCandidateMate.cpp | 41 +- .../parallelComputeCandidateMateB.cpp | 3 +- .../impl/aggregator/processExposedVertex.cpp | 156 +++--- .../aggregator/processMatchedVertices.cpp | 437 +++++++-------- .../processMatchedVerticesAndSendMessages.cpp | 517 +++++++++--------- amgprec/impl/aggregator/processMessages.cpp | 366 ++++++------- amgprec/impl/aggregator/queueTransfer.cpp | 3 +- .../impl/aggregator/sendBundledMessages.cpp | 241 ++++---- 9 files changed, 978 insertions(+), 961 deletions(-) diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp index bb2dd5a7..49b366a6 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -70,7 +70,7 @@ Statistics: ph0_time, ph1_time, ph2_time: Runtimes Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) */ - +//#define DEBUG_HANG_ #ifdef SERIAL_MPI #else @@ -110,17 +110,24 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( #endif #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ") verDistance [" << verDistance[0] << "," << verDistance[1] << "," << verDistance[2] << "," << verDistance[3] << "]"; + cout << "\n(" << myRank << ") verDistance [" ; + for (int i = 0; i < numProcs; i++) + cout << verDistance[i] << "," << verDistance[i+1]; + cout << "]\n"; fflush(stdout); #endif #ifdef DEBUG_HANG_ - if (myRank == 0) - cout << "\n(" << myRank << ") verDistance [" << verDistance[0] << "," << verDistance[1] << "," << verDistance[2] << "," << verDistance[3] << "]"; + if (myRank == 0) { + cout << "\n(" << myRank << ") verDistance [" ; + for (int i = 0; i < numProcs; i++) + cout << verDistance[i] << "," ; + cout << verDistance[numProcs]<< "]\n"; + } fflush(stdout); #endif MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank - MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank + MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank MPI_Status computeStatus; @@ -211,7 +218,11 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( finishTime = MPI_Wtime(); *ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization - +#ifdef DEBUG_HANG_ + cout << myRank << " Finished initialization" << endl; + fflush(stdout); +#endif + startTime = MPI_Wtime(); ///////////////////////////////////////////////////////////////////////////////////////// @@ -233,6 +244,17 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( edgeLocWeight, candidateMate); +#ifdef DEBUG_HANG_ + cout << myRank << " Finished Exposed Vertex" << endl; + fflush(stdout); +#if 0 + cout << myRank << " candidateMate after parallelCompute " < &privateQGhostVtx, vector &privateQMsgType, vector &privateQOwner) -{ +{ MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; - MilanInt ghostOwner = 0, option; + MilanInt ghostOwner = 0, option, igw; -#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ - firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \ - 
num_threads(NUM_THREAD) +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \ + default(shared) num_threads(NUM_THREAD) { #pragma omp for reduction(+ \ : PCounter[:numProcs], myCard \ [:1], msgInd \ [:1], NumMessagesBundled \ - [:1]) schedule(static) - for (v = 0; v < NLVer; v++) - { + [:1]) \ + schedule(static) + for (v = 0; v < NLVer; v++) { option = -1; // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) k = candidateMate[v]; @@ -67,91 +67,81 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, #pragma omp critical(processExposed) { - if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) - { - w = computeCandidateMate(verLocPtr[v], - verLocPtr[v + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - candidateMate[v] = w; + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; } - - if (w >= 0) - { - (*myCard)++; - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost vertex - option = 2; - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) - { - option = 1; - Mate[v] = w; - GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost - - } // End of if CandidateMate[w] = v - - } // End of if a Ghost Vertex - else - { // w is a local vertex - - if (candidateMate[w - StartIndex] == (v + StartIndex)) - { - option = 3; - Mate[v] = w; // v is local - Mate[w - StartIndex] = v + StartIndex; // w is local - + + if (w >= 0) { + (*myCard)++; + if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex + option = 2; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { + option = 1; + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + option = 3; + Mate[v] = w; // v is local + Mate[w - StartIndex] = v + StartIndex; // w is local + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); #endif - - } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) - } // End of Else - + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + } // End of second if - + } // End critical processExposed - + } // End of if(w >=0) - else - { - // This piece of code is executed a really small amount of times - adj11 = verLocPtr[v]; - adj12 = verLocPtr[v + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + else { + // This piece of code is executed a really small amount of times + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << 
myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - (*msgInd)++; - (*NumMessagesBundled)++; - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - - privateQLocalVtx.push_back(v + StartIndex); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop + (*msgInd)++; + (*NumMessagesBundled)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop } // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - + switch (option) { case -1: @@ -202,4 +192,4 @@ void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, privateQOwner); } // End of parallel region -} \ No newline at end of file +} diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp index e96dcc1d..d9363c39 100644 --- a/amgprec/impl/aggregator/processMatchedVertices.cpp +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -46,264 +46,249 @@ void processMatchedVertices( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ - firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ - num_threads(NUM_THREAD) \ - reduction(+ \ - : msgInd[:1], PCounter \ - [:numProcs], myCard \ - [:1], NumMessagesBundled \ + //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \ + privateQMsgType, privateQOwner, UChunkBeingProcessed) \ + default(shared) num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ [:1]) { - while (!U.empty()) - { - - extractUChunk(UChunkBeingProcessed, U, privateU); - - for (MilanLongInt u : UChunkBeingProcessed) - { + while (!U.empty()) { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices - + if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices + #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - option = -1; - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: - + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: + #ifdef 
PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif #pragma omp atomic read - mateVal = Mate[v - StartIndex]; - // If the current vertex is pointing to a matched vertex and is not matched - if (mateVal < 0) - { + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) { #pragma omp critical - { - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - + { + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + // If found a dominating edge: + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - option = 2; - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - option = 1; - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - option = 3; - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - option = 4; // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u - } // End of task - } // mateval < 0 - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex - + } // 
End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { // Neighbor is a ghost vertex + #pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - option = 5; // u is local - } // End of critical - } // End of Else //A Ghost Vertex - - switch (option) - { - case -1: - // No things to do - break; - case 1: - // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v - privateU.push_back(v); - privateU.push_back(w); - - (*myCard)++; + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); + + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); - case 2: - - // Found a dominating edge, it is a ghost - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - break; - case 3: - privateU.push_back(v); - privateU.push_back(w); - - (*myCard)++; - break; - case 4: - // Could not find a dominating vertex - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + (*NumMessagesBundled)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); + + (*myCard)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, 
verDistance, myRank, numProcs); + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - - PCounter[ghostOwner]++; - (*NumMessagesBundled)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - break; - case 5: - default: - + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + + PCounter[ghostOwner]++; + (*NumMessagesBundled)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - // assert(ghostOwner != -1); - // assert(ghostOwner != myRank); - - (*NumMessagesBundled)++; - PCounter[ghostOwner]++; - (*msgInd)++; - - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - break; - } // End of switch - - } // End of inner for - } - } // End of outer for - - queuesTransfer(U, privateU, QLocalVtx, - QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + + (*NumMessagesBundled)++; + PCounter[ghostOwner]++; + (*msgInd)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + #pragma omp critical(U) - { - U.insert(U.end(), privateU.begin(), privateU.end()); - } - - privateU.clear(); - + { + U.insert(U.end(), privateU.begin(), privateU.end()); + } + + privateU.clear(); + #pragma omp critical(sendMessageTransfer) - { - - QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); - QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); - QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); - QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); - } - - privateQLocalVtx.clear(); - privateQGhostVtx.clear(); - privateQMsgType.clear(); - privateQOwner.clear(); - + { + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), 
privateQOwner.end()); + } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + } // End of while ( !U.empty() ) - + #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, omp_get_thread_num(), myRank); - + #endif } // End of parallel region } diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp index 3322a05b..469d7a16 100644 --- a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -1,39 +1,39 @@ #include "MatchBoxPC.h" - +//#define DEBUG_HANG_ void processMatchedVerticesAndSendMessages( - MilanLongInt NLVer, - vector &UChunkBeingProcessed, - vector &U, - vector &privateU, - MilanLongInt StartIndex, - MilanLongInt EndIndex, - MilanLongInt *myCard, - MilanLongInt *msgInd, - MilanLongInt *NumMessagesBundled, - MilanLongInt *SPtr, - MilanLongInt *verLocPtr, - MilanLongInt *verLocInd, - MilanLongInt *verDistance, - MilanLongInt *PCounter, - vector &Counter, - MilanInt myRank, - MilanInt numProcs, - MilanLongInt *candidateMate, - vector &GMate, - MilanLongInt *Mate, - map &Ghost2LocalMap, - MilanReal *edgeLocWeight, - vector &QLocalVtx, - vector &QGhostVtx, - vector &QMsgType, - vector &QOwner, - vector &privateQLocalVtx, - vector &privateQGhostVtx, - vector &privateQMsgType, - vector &privateQOwner, - MPI_Comm comm, - MilanLongInt *msgActual, - vector &Message) + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + vector &U, + vector &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, + MPI_Comm comm, + MilanLongInt *msgActual, + vector &Message) { MilanLongInt initialSize = QLocalVtx.size(); @@ -50,266 +50,259 @@ void processMatchedVerticesAndSendMessages( #ifdef COUNT_LOCAL_VERTEX MilanLongInt localVertices = 0; #endif -#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ - firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ - num_threads(NUM_THREAD) \ - reduction(+ \ - : msgInd[:1], PCounter \ - [:numProcs], myCard \ - [:1], NumMessagesBundled \ - [:1], msgActual \ + //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\ + privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ [:1]) { - while (!U.empty()) - { - - extractUChunk(UChunkBeingProcessed, U, privateU); - - for (MilanLongInt u : UChunkBeingProcessed) - { + while (!U.empty()) { + + extractUChunk(UChunkBeingProcessed, U, 
privateU); + + for (MilanLongInt u : UChunkBeingProcessed) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")u: " << u; - fflush(stdout); + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); #endif - if ((u >= StartIndex) && (u <= EndIndex)) - { // Process Only the Local Vertices - + if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices + #ifdef COUNT_LOCAL_VERTEX - localVertices++; + localVertices++; #endif - - // Get the Adjacency list for u - adj1 = verLocPtr[u - StartIndex]; // Pointer - adj2 = verLocPtr[u - StartIndex + 1]; - for (k = adj1; k < adj2; k++) - { - option = -1; - v = verLocInd[k]; - - if ((v >= StartIndex) && (v <= EndIndex)) - { // If Local Vertex: - + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; - fflush(stdout); + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); #endif #pragma omp atomic read - mateVal = Mate[v - StartIndex]; - // If the current vertex is pointing to a matched vertex and is not matched - if (mateVal < 0) - { + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) { #pragma omp critical - { - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], - verLocPtr[v - StartIndex + 1], - edgeLocWeight, 0, - verLocInd, - StartIndex, - EndIndex, - GMate, - Mate, - Ghost2LocalMap); - - candidateMate[v - StartIndex] = w; - + { + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost + // If found a dominating edge: + if (w >= 0) { + + if ((w < StartIndex) || (w > EndIndex)) { // A ghost #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message:"; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); #endif - option = 2; - - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - option = 1; - Mate[v - StartIndex] = w; // v is a local vertex - GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex - - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - option = 3; - Mate[v - StartIndex] = w; // v is a local vertex - Mate[w - StartIndex] = v; // w is a local vertex - + option = 2; + + if (candidateMate[NLVer + 
Ghost2LocalMap[w]] == v) { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - option = 4; // End of Else: w == -1 - // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of If (candidateMate[v-StartIndex] == u - } // End of task - } // mateval < 0 - } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: - else - { // Neighbor is a ghost vertex - + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { // Neighbor is a ghost vertex + #pragma omp critical - { - if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) - candidateMate[NLVer + Ghost2LocalMap[v]] = -1; - if (v != Mate[u - StartIndex]) - option = 5; // u is local - } // End of critical - } // End of Else //A Ghost Vertex - - switch (option) - { - case -1: - // No things to do - break; - case 1: - // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v - privateU.push_back(v); - privateU.push_back(w); - (*myCard)++; + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); #endif - // Decrement the counter: - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); - case 2: - - // Found a dominating edge, it is a ghost - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - - // Build the Message Packet: - // Message[0] = v; // LOCAL - // Message[1] = w; // GHOST - // Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(REQUEST); - privateQOwner.push_back(ghostOwner); - break; - case 3: - privateU.push_back(v); - privateU.push_back(w); - (*myCard)++; - break; - case 4: - // Could not find a dominating vertex - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - + // Decrement the counter: + 
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); + (*myCard)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - - // Build the Message Packet: - // Message[0] = v; // LOCAL - // Message[1] = w; // GHOST - // Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - (*msgInd)++; - - privateQLocalVtx.push_back(v); - privateQGhostVtx.push_back(w); - privateQMsgType.push_back(FAILURE); - privateQOwner.push_back(ghostOwner); - - } // End of if(GHOST) - } // End of for loop - break; - case 5: - default: - + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a success message: "; - cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); #endif - - ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); - - // Build the Message Packet: - // Message[0] = u; // LOCAL - // Message[1] = v; // GHOST - // Message[2] = SUCCESS; // TYPE - // Send a Request (Asynchronous) - // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - - (*msgActual)++; - (*msgInd)++; - - privateQLocalVtx.push_back(u); - privateQGhostVtx.push_back(v); - privateQMsgType.push_back(SUCCESS); - privateQOwner.push_back(ghostOwner); - - break; - } // End of switch - } // End of inner for - } - } // End of outer for - - queuesTransfer(U, privateU, QLocalVtx, - 
QGhostVtx, - QMsgType, QOwner, privateQLocalVtx, - privateQGhostVtx, - privateQMsgType, - privateQOwner); - + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = u; // LOCAL + // Message[1] = v; // GHOST + // Message[2] = SUCCESS; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + } // End of while ( !U.empty() ) - + #ifdef COUNT_LOCAL_VERTEX printf("Count local vertexes: %ld for thread %d of processor %d\n", localVertices, omp_get_thread_num(), myRank); - + #endif } // End of parallel region - + // Send the messages - for (int i = initialSize; i < QOwner.size(); i++) - { - - Message[0] = QLocalVtx[i]; - Message[1] = QGhostVtx[i]; - Message[2] = QMsgType[i]; - ghostOwner = QOwner[i]; - - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); +#ifdef DEBUG_HANG_ + cout << myRank<<" Sending: "<(), ghostOwner, ComputeTag, comm); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); } +#ifdef DEBUG_HANG_ + cout << myRank<<" Done sending messages"<(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); if (error_codeC != MPI_SUCCESS) { @@ -86,70 +88,66 @@ void processMessages( fflush(stdout); } Sender = computeStatus.MPI_SOURCE; - + //cout << " ...from "<(), Sender, BundleTag, comm, &computeStatus); - if (error_codeC != MPI_SUCCESS) - { - MPI_Error_string(error_codeC, error_message, &message_length); - cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; - fflush(stdout); - } + // Receive the message + //cout << myRank<<" Receiving from "<(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; + fflush(stdout); + } #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message Bundle After: " << endl; - for (int i = 0; i < bundleSize; i++) - cout << ReceiveBuffer[i] << ","; - cout << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message Bundle After: " << endl; + for (int i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); #endif - } - else - { // Just a single message: + } else { // Just a single message: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; + fflush(stdout); #endif - // Add the current message to Queue: - bundleSize = 3; //#of integers in the message - // Build the Message Buffer: - if (!ReceiveBuffer.empty()) - ReceiveBuffer.clear(); // Empty it out first - ReceiveBuffer.resize(bundleSize, -1); // Initialize - - ReceiveBuffer[0] = Message[0]; // u - ReceiveBuffer[1] = Message[1]; // v - ReceiveBuffer[2] = Message[2]; // message_type + // Add the 
current message to Queue: + bundleSize = 3; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize + + ReceiveBuffer[0] = Message[0]; // u + ReceiveBuffer[1] = Message[1]; // v + ReceiveBuffer[2] = Message[2]; // message_type } - + #ifdef DEBUG_GHOST_ - if ((v < StartIndex) || (v > EndIndex)) - { - cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; - fflush(stdout); + if ((v < StartIndex) || (v > EndIndex)) { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); } #endif #ifdef PRINT_DEBUG_INFO_ @@ -158,172 +156,160 @@ void processMessages( #endif // Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop - for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) - { - u = ReceiveBuffer[bundleCounter - 3]; // GHOST - v = ReceiveBuffer[bundleCounter - 2]; // LOCAL - message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE - - // CASE I: REQUEST - if (message_type == REQUEST) - { + for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { + u = ReceiveBuffer[bundleCounter - 3]; // GHOST + v = ReceiveBuffer[bundleCounter - 2]; // LOCAL + message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE + + // CASE I: REQUEST + if (message_type == REQUEST) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is REQUEST" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message type is REQUEST" << endl; + fflush(stdout); #endif #ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } #endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched (v is local) - candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost - if (candidateMate[v - StartIndex] == u) - { - GMate[Ghost2LocalMap[u]] = v; // u is ghost - Mate[v - StartIndex] = u; // v is local - U.push_back(v); - U.push_back(u); - (*myCard)++; + if (Mate[v - StartIndex] == -1) { + // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + U.push_back(v); + U.push_back(u); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; + fflush(stdout); #endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); - } // End of if ( candidateMate[v-StartIndex] == u )e - } // End of if ( Mate[v] == -1 ) - } // End of REQUEST - else - { // CASE II: SUCCESS - if (message_type == SUCCESS) - { + + 
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else { // CASE II: SUCCESS + if (message_type == SUCCESS) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); #ifdef DEBUG_GHOST_ - if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) - { - cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; - fflush(stdout); - } + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } #endif - if (Mate[v - StartIndex] == -1) - { // Process only if not already matched ( v is local) - if (candidateMate[v - StartIndex] == u) - { - // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); - candidateMate[v - StartIndex] = w; + if (Mate[v - StartIndex] == -1) { + // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, + verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); + candidateMate[v - StartIndex] = w; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; - fflush(stdout); + cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; + fflush(stdout); #endif - // If found a dominating edge: - if (w >= 0) - { - if ((w < StartIndex) || (w > EndIndex)) - { // w is a ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = REQUEST; // TYPE - // Send a Request (Asynchronous) + // If found a dominating edge: + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { + // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a request message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - //assert(ghostOwner != -1); - //assert(ghostOwner != myRank); - - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgInd)++; - (*msgActual)++; - if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) - { - Mate[v - StartIndex] = w; // v is local - 
GMate[Ghost2LocalMap[w]] = v; // w is ghost - U.push_back(v); - U.push_back(w); - (*myCard)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); + (*msgInd)++; + (*msgActual)++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + U.push_back(v); + U.push_back(w); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); #endif - - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); - } // End of if CandidateMate[w] = v - } // End of if a Ghost Vertex - else - { // w is a local vertex - if (candidateMate[w - StartIndex] == v) - { - Mate[v - StartIndex] = w; // v is local - Mate[w - StartIndex] = v; // w is local - // Q.push_back(u); - U.push_back(v); - U.push_back(w); - (*myCard)++; + + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + (*myCard)++; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; - fflush(stdout); + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); #endif - } // End of if(CandidateMate(w) = v - } // End of Else - } // End of if(w >=0) - else - { // No dominant edge found - adj11 = verLocPtr[v - StartIndex]; - adj12 = verLocPtr[v - StartIndex + 1]; - for (k1 = adj11; k1 < adj12; k1++) - { - w = verLocInd[k1]; - if ((w < StartIndex) || (w > EndIndex)) - { // A ghost - // Build the Message Packet: - Message[0] = v; // LOCAL - Message[1] = w; // GHOST - Message[2] = FAILURE; // TYPE - // Send a Request (Asynchronous) + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { + // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending a failure message: "; - cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); #endif - ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); - //assert(ghostOwner != -1); - //assert(ghostOwner != myRank); - MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); - (*msgInd)++; - (*msgActual)++; - } // End of if(GHOST) - } // End of for loop - } // End of Else: w == -1 + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); + //cout << myRank<<" Sending to "<(), ghostOwner, 
ComputeTag, comm); + (*msgInd)++; + (*msgActual)++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) - } // End of if ( candidateMate[v-StartIndex] == u ) - } // End of if ( Mate[v] == -1 ) - } // End of if ( message_type == SUCCESS ) - else - { // CASE III: FAILURE + } // End of if ( candidateMate[v-StartIndex] == u ) + } // End of if ( Mate[v] == -1 ) + } // End of if ( message_type == SUCCESS ) + else { + // CASE III: FAILURE #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Message type is FAILURE" << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Message type is FAILURE" << endl; + fflush(stdout); #endif - GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore - PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter - } // End of else: CASE III - } // End of else: CASE I + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I } - + return; -} \ No newline at end of file +} diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp index 7200b43d..33c65749 100644 --- a/amgprec/impl/aggregator/queueTransfer.cpp +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -32,4 +32,5 @@ void queuesTransfer(vector &U, privateQGhostVtx.clear(); privateQMsgType.clear(); privateQOwner.clear(); -} \ No newline at end of file + +} diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp index f7fd2f78..80a88b94 100644 --- a/amgprec/impl/aggregator/sendBundledMessages.cpp +++ b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -38,108 +38,107 @@ void sendBundledMessages(MilanLongInt *numGhostEdges, #pragma omp task depend(inout \ : PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \ : NumMessagesBundled, numProcs) - {try { + { + try { PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message - PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure - PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets -} -catch (length_error) -{ - cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); -} -PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize -PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable -PSizeInfoMessages.resize(numProcs * 3, 0); -} + PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize + PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable + PSizeInfoMessages.resize(numProcs * 3, 0); + } #pragma omp task depend(inout \ - : PCumulative) depend(in \ + : PCumulative) depend(in \ : PCounter) -{ - for (i = 0; i < numProcs; i++) - PCumulative[i + 1] = PCumulative[i] + PCounter[i]; -} - + 
{ + for (i = 0; i < numProcs; i++) + PCumulative[i + 1] = PCumulative[i] + PCounter[i]; + } + #pragma omp task depend(inout \ : PCounter) -{ - // Reuse PCounter to keep track of how many messages were inserted: - for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! - PCounter[i] = 0; -} + { + // Reuse PCounter to keep track of how many messages were inserted: + for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! + PCounter[i] = 0; + } // Build the Message Bundle packet: #pragma omp task depend(in \ : PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \ : myIndex, PMessageBundle, PCounter) { - for (i = 0; i < NumMessagesBundled; i++) - { - myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; - PMessageBundle[myIndex + 0] = QLocalVtx[i]; - PMessageBundle[myIndex + 1] = QGhostVtx[i]; - PMessageBundle[myIndex + 2] = QMsgType[i]; - PCounter[QOwner[i]]++; - } -} - + for (i = 0; i < NumMessagesBundled; i++) { + myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; + PMessageBundle[myIndex + 0] = QLocalVtx[i]; + PMessageBundle[myIndex + 1] = QGhostVtx[i]; + PMessageBundle[myIndex + 2] = QMsgType[i]; + PCounter[QOwner[i]]++; + } + } + // Send the Bundled Messages: Use ISend #pragma omp task depend(out \ : SRequest, SStatus) -{ - try - { - SRequest.reserve(numProcs * 2); // At most two messages per processor - SStatus.reserve(numProcs * 2); // At most two messages per processor - } - catch (length_error) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; - cout << "Not enough memory to allocate the internal variables \n"; - exit(1); - } -} - + { + try + { + SRequest.reserve(numProcs * 2); // At most two messages per processor + SStatus.reserve(numProcs * 2); // At most two messages per processor + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + } + // Send the Messages #pragma omp task depend(inout \ : SRequest, PSizeInfoMessages, PCumulative) depend(out \ : *msgActual, *msgInd) { - for (i = 0; i < numProcs; i++) - { // Changed by Fabio to be an integer, addresses needs to be integers! - if (i == myRank) // Do not send anything to yourself - continue; - // Send the Message with information about the size of next message: - // Build the Message Packet: - PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message - PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet - PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE - // Send a Request (Asynchronous) + for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! 
+ if (i == myRank) // Do not send anything to yourself + continue; + // Send the Message with information about the size of next message: + // Build the Message Packet: + PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message + PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet + PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE + // Send a Request (Asynchronous) #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; - fflush(stdout); + cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; + fflush(stdout); #endif - if (PSizeInfoMessages[i * 3 + 0] > 0) - { // Send only if it is a nonempty packet - MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, - &SRequest[(*msgInd)]); - (*msgActual)++; - (*msgInd)++; - // Now Send the message with the data packet: + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[(*msgInd)]); + (*msgActual)++; + (*msgInd)++; + // Now Send the message with the data packet: #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; - for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) - cout << PMessageBundle[k] << ","; - cout << endl; - fflush(stdout); + cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; + for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) + cout << PMessageBundle[k] << ","; + cout << endl; + fflush(stdout); #endif - MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], - TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); - (*msgInd)++; - } // End of if size > 0 - } + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); + (*msgInd)++; + } // End of if size > 0 + } } #pragma omp task depend(inout \ @@ -147,64 +146,64 @@ PSizeInfoMessages.resize(numProcs * 3, 0); { // Free up temporary memory: - PCumulative.clear(); - QLocalVtx.clear(); - QGhostVtx.clear(); - QMsgType.clear(); - QOwner.clear(); + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); } #pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges) { #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; - cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; - cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; - if (*numGhostEdges > 0) + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; + cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; + if (*numGhostEdges > 0) { - cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; } - fflush(stdout); + fflush(stdout); #endif - // Allocate 
memory for MPI Send messages: - /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */ - OneMessageSize = 0; - MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet - // How many messages to send? - // Potentially three kinds of messages will be sent/received: - // Request, Success, Failure. - // But only two will be sent from a given processor. - // Substract the number of messages that have already been sent as bundled messages: - numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; - *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; + // Allocate memory for MPI Send messages: + /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */ + OneMessageSize = 0; + MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; + *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; } #pragma omp task depend(out : Buffer) depend(in : *BufferSize) -{ - Buffer = 0; + { + Buffer = 0; #ifdef PRINT_DEBUG_INFO_ - cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; - cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; - cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; - cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; - cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); - cout << "\n(" << myRank << ")Attaching Buffer on.. "; - fflush(stdout); + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); + cout << "\n(" << myRank << ")Attaching Buffer on.. 
"; + fflush(stdout); #endif - if ((*BufferSize) > 0) - { - Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory - if (Buffer == 0) - { - cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; - cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; - exit(1); - } - MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer - } + if ((*BufferSize) > 0) + { + Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; + exit(1); + } + MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer + } + } } } } -} \ No newline at end of file From a612cea167c0d74ffd14cf00eaea91ae61e96cc2 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 07:53:04 -0500 Subject: [PATCH 83/96] Debug for matchboxp --- amgprec/amg_d_matchboxp_mod.f90 | 39 ++++++++++++++++++++++++++++----- amgprec/amg_s_matchboxp_mod.f90 | 39 ++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index 2df58797..e19ce617 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -143,7 +143,7 @@ contains type(psb_ld_coo_sparse_mat) :: tmpcoo logical :: display_out_, print_out_, reproducible_ logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & - & debug_ilaggr=.false., debug_sync=.false. + & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 @@ -214,7 +214,20 @@ contains call psb_barrier(ictxt) if (iam == 0) write(0,*)' out from buildmatching:', info end if - + if (debug_mate) then + block + integer(psb_lpk_), allocatable :: ckmate(:) + allocate(ckmate(nr)) + ckmate(1:nr) = mate(1:nr) + call psb_msort(ckmate(1:nr)) + do i=1,nr-1 + if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then + write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i) + end if + end do + end block + end if + if (info == 0) then if (do_timings) call psb_tic(idx_phase2) if (debug_sync) then @@ -276,6 +289,9 @@ contains ilaggr(idx) = nlaggr(iam) wtemp(k) = w(k)/nrmagg wtemp(idx) = w(idx)/nrmagg + else + write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,& + &mate(idx),ilaggr(idx) end if nlpairs = nlpairs+1 else if (idx <= nc) then @@ -326,6 +342,12 @@ contains nlsingl = nlsingl + 1 end if end if + if (ilaggr(k) == ilaggr_neginit) then + write(0,*) iam,' Error: no update to ',k,mate(k),& + & abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k) + end if + else + if (ilaggr(k)<0) write(0,*) 'Strange? 
',k,ilaggr(k) end if end if end do @@ -360,9 +382,14 @@ contains else write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) end if - end if - if (ilaggr(k) <0) then - write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + else if (ilaggr(k) <0) then + write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + write(0,*) iam,' : : ',nr,nc,mate(k) + if (mate(k) <= nr) then + write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),& + & ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k))) + end if + flush(0) end if end do if (debug_sync) then @@ -415,7 +442,7 @@ contains end block if (iam == 0) then - write(0,*) 'Matching statistics: Unmatched nodes ',& + write(0,*) iam,'Matching statistics: Unmatched nodes ',& & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs end if diff --git a/amgprec/amg_s_matchboxp_mod.f90 b/amgprec/amg_s_matchboxp_mod.f90 index 9144d70e..a7f41c24 100644 --- a/amgprec/amg_s_matchboxp_mod.f90 +++ b/amgprec/amg_s_matchboxp_mod.f90 @@ -143,7 +143,7 @@ contains type(psb_ls_coo_sparse_mat) :: tmpcoo logical :: display_out_, print_out_, reproducible_ logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & - & debug_ilaggr=.false., debug_sync=.false. + & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 @@ -214,7 +214,20 @@ contains call psb_barrier(ictxt) if (iam == 0) write(0,*)' out from buildmatching:', info end if - + if (debug_mate) then + block + integer(psb_lpk_), allocatable :: ckmate(:) + allocate(ckmate(nr)) + ckmate(1:nr) = mate(1:nr) + call psb_msort(ckmate(1:nr)) + do i=1,nr-1 + if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then + write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i) + end if + end do + end block + end if + if (info == 0) then if (do_timings) call psb_tic(idx_phase2) if (debug_sync) then @@ -276,6 +289,9 @@ contains ilaggr(idx) = nlaggr(iam) wtemp(k) = w(k)/nrmagg wtemp(idx) = w(idx)/nrmagg + else + write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,& + &mate(idx),ilaggr(idx) end if nlpairs = nlpairs+1 else if (idx <= nc) then @@ -326,6 +342,12 @@ contains nlsingl = nlsingl + 1 end if end if + if (ilaggr(k) == ilaggr_neginit) then + write(0,*) iam,' Error: no update to ',k,mate(k),& + & abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k) + end if + else + if (ilaggr(k)<0) write(0,*) 'Strange? 
',k,ilaggr(k) end if end if end do @@ -360,9 +382,14 @@ contains else write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) end if - end if - if (ilaggr(k) <0) then - write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + else if (ilaggr(k) <0) then + write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + write(0,*) iam,' : : ',nr,nc,mate(k) + if (mate(k) <= nr) then + write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),& + & ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k))) + end if + flush(0) end if end do if (debug_sync) then @@ -415,7 +442,7 @@ contains end block if (iam == 0) then - write(0,*) 'Matching statistics: Unmatched nodes ',& + write(0,*) iam,'Matching statistics: Unmatched nodes ',& & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs end if From 73e5d499131c086b77edca6c94538fc2ecda17a0 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 2 Jun 2023 11:37:58 +0200 Subject: [PATCH 84/96] Added timers to build phases --- .../aggregator/amg_c_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_caggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../aggregator/amg_d_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_daggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../aggregator/amg_s_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_saggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../aggregator/amg_z_dec_aggregator_tprol.f90 | 10 +++++ .../impl/aggregator/amg_zaggrmat_smth_bld.f90 | 38 ++++++++++++++++--- .../impl/level/amg_c_base_onelev_mat_asb.f90 | 16 +++++++- .../impl/level/amg_d_base_onelev_mat_asb.f90 | 16 +++++++- .../impl/level/amg_s_base_onelev_mat_asb.f90 | 16 +++++++- .../impl/level/amg_z_base_onelev_mat_asb.f90 | 16 +++++++- 12 files changed, 224 insertions(+), 32 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 index 4efaf61d..250cc549 100644 --- a/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_c_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_c_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
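[Editorial sketch of the timing idiom used by this patch.] The hunks in this patch wrap each build phase in the PSBLAS tic/toc timers. Below is a minimal sketch of that idiom, assuming the psb_get_timer_idx / psb_tic / psb_toc utilities used in these hunks are available from the PSBLAS modules already included by these files; the subroutine name, module use, and timer label are illustrative only, not part of the patch:

    subroutine timed_phase_sketch()
      use psb_base_mod        ! assumed: provides psb_get_timer_idx, psb_tic, psb_toc
      implicit none
      ! Timer indices are registered once and cached across calls
      integer(psb_ipk_), save :: idx_phase = -1
      logical, parameter      :: do_timings = .true.

      ! Register the timer on first entry under a descriptive label
      if ((do_timings).and.(idx_phase == -1)) &
           & idx_phase = psb_get_timer_idx("SKETCH: my_phase")

      if (do_timings) call psb_tic(idx_phase)
      ! ... the work to be measured goes here ...
      if (do_timings) call psb_toc(idx_phase)
    end subroutine timed_phase_sketch

Keeping do_timings as a compile-time parameter lets the compiler drop the instrumentation when it is .false., which is how the hunks below leave it by default.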
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 index 53e740fe..c4a85b05 100644 --- a/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_spk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = czero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= czero) then adiag(i) = cone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = cone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 index 2edcca6c..26edbb0a 100644 --- a/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_d_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
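[Editorial sketch of the OpenMP idiom used by this patch.] The *aggrmat_smth_bld hunks above parallelize the matrix-filtering and diagonal-inversion loops with OpenMP worksharing. A minimal self-contained sketch of that pattern follows, mirroring the diagonal loop above; the plain types, constants, and subroutine name are illustrative substitutes for the library-specific psb_ kinds:

    subroutine invert_diag_sketch(n, adiag)
      implicit none
      integer, intent(in)             :: n
      real(kind(1.d0)), intent(inout) :: adiag(n)
      integer :: i
      ! Each iteration writes only adiag(i), so the loop parallelizes with a
      ! plain static schedule; no reduction or synchronization is required.
      !$OMP parallel do private(i) schedule(static)
      do i = 1, n
        if (adiag(i) /= 0.d0) then
          adiag(i) = 1.d0/adiag(i)
        else
          adiag(i) = 1.d0
        end if
      end do
      !$OMP end parallel do
    end subroutine invert_diag_sketch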
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 index 82da3fc7..d365bf27 100644 --- a/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_dpk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = dzero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= dzero) then adiag(i) = done / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = done end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 index c52c04f7..9529d141 100644 --- a/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_s_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 index d96176b2..c2eae3a4 100644 --- a/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_spk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = szero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= szero) then adiag(i) = sone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = sone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 index a64e3ebb..a6a7856e 100644 --- a/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_z_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! 
+ if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 index 2f944699..7b8ed075 100644 --- a/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_dpk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. + integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = zzero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! 
Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= zzero) then adiag(i) = zone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = zone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 index e79c90c9..27896806 100644 --- a/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_cspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_c_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! 
Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! + if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 index e9e55a9a..6bd4e1ac 100644 --- a/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_dspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_d_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 index 271b31d0..034151d3 100644 --- a/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_sspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_s_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 index 07ab3e0b..eb11cad2 100644 --- a/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_zspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_z_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 From 494b8b925ff06b9610db488f045909e166aa65cc Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 5 Jun 2023 11:46:39 +0200 Subject: [PATCH 85/96] OpenMP loop in samples data generation --- samples/advanced/pdegen/amg_d_genpde_mod.F90 | 420 ++++++++++--------- samples/advanced/pdegen/amg_s_genpde_mod.F90 | 420 ++++++++++--------- 2 files changed, 444 insertions(+), 396 deletions(-) diff --git a/samples/advanced/pdegen/amg_d_genpde_mod.F90 b/samples/advanced/pdegen/amg_d_genpde_mod.F90 index d6acd01c..ec3affc5 100644 --- a/samples/advanced/pdegen/amg_d_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_d_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! 
term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! 
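! Stripped-down, stand-alone sketch of the workspace pattern used in this
! loop: the OpenMP parallel region wraps a Fortran BLOCK construct, so each
! thread gets its own copies of the allocatable work arrays without changing
! the declarations of the host subroutine. The program name, array name and
! sizes below are illustrative; OPENMP is the preprocessor macro already used
! by this tree.
program block_private_sketch
#if defined(OPENMP)
  use omp_lib
#endif
  implicit none
  integer, parameter :: nb = 4, nlr = 20
  integer :: ii
  !$omp parallel
  block
    integer :: ith, nth
    real, allocatable :: val(:)
#if defined(OPENMP)
    nth = omp_get_num_threads()
    ith = omp_get_thread_num()
#else
    nth = 1
    ith = 0
#endif
    ! Declared inside the BLOCK, hence private to this thread.
    allocate(val(20*nb))
    !$omp do schedule(dynamic)
    do ii = 1, nlr, nb
      val(1) = real(ii)      ! per-block work on the private buffer
    end do
    !$omp end do
    deallocate(val)
  end block
  !$omp end parallel
end program block_private_sketch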
term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! 
x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(done,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! 
term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(done,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) diff --git a/samples/advanced/pdegen/amg_s_genpde_mod.F90 b/samples/advanced/pdegen/amg_s_genpde_mod.F90 index 7d32cf30..dfa79ab3 100644 --- a/samples/advanced/pdegen/amg_s_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_s_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! 
- val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! 
term depending on (x,y,z) + val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. 
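! Note on the error handling in the loops above: the serial version bailed
! out of the assembly loop with EXIT at the first failure, but a loop
! associated with an !$omp do construct may not be left prematurely, so the
! OpenMP version sets info and skips the remaining per-iteration work with
! CYCLE; the failure is then reported once, after the parallel region, in
! the tgen check that follows.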
- call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! 
term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) From 3a5e73e4c8e2871f7912f7525c8b5d997803d3d3 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 14 Jun 2023 14:05:42 +0200 Subject: [PATCH 86/96] adjust NTH in samples/pdegen --- .../{amg_d_pde2d.f90 => amg_d_pde2d.F90} | 18 ++++++++++++++++-- .../{amg_d_pde3d.f90 => amg_d_pde3d.F90} | 16 +++++++++++++++- .../{amg_s_pde2d.f90 => amg_s_pde2d.F90} | 18 ++++++++++++++++-- .../{amg_s_pde3d.f90 => amg_s_pde3d.F90} | 16 +++++++++++++++- 4 files changed, 62 insertions(+), 6 deletions(-) rename samples/advanced/pdegen/{amg_d_pde2d.f90 => amg_d_pde2d.F90} (98%) rename samples/advanced/pdegen/{amg_d_pde3d.f90 => amg_d_pde3d.F90} (98%) rename samples/advanced/pdegen/{amg_s_pde2d.f90 => amg_s_pde2d.F90} (98%) rename samples/advanced/pdegen/{amg_s_pde3d.f90 => amg_s_pde3d.F90} (98%) diff --git a/samples/advanced/pdegen/amg_d_pde2d.f90 b/samples/advanced/pdegen/amg_d_pde2d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_d_pde2d.f90 rename to samples/advanced/pdegen/amg_d_pde2d.F90 index c036aa6d..37e9fcd6 100644 --- a/samples/advanced/pdegen/amg_d_pde2d.f90 +++ b/samples/advanced/pdegen/amg_d_pde2d.F90 @@ -73,6 +73,9 @@ program amg_d_pde2d use amg_d_pde2d_exp_mod use amg_d_pde2d_box_mod use amg_d_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -93,7 +96,7 @@ program amg_d_pde2d type(psb_d_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -197,6 +200,15 @@ program amg_d_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
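! The thread count is queried inside a parallel region because
! omp_get_num_threads() returns 1 when called from the sequential part of
! the program; the MASTER block ensures a single thread writes the shared
! variable nth. A one-line alternative (not used here) would be to call
! omp_get_max_threads() from the sequential part, at the price of reporting
! the configured rather than the actually spawned number of threads.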
This should not happen, but just in case @@ -451,7 +463,9 @@ program amg_d_pde2d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) diff --git a/samples/advanced/pdegen/amg_d_pde3d.f90 b/samples/advanced/pdegen/amg_d_pde3d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_d_pde3d.f90 rename to samples/advanced/pdegen/amg_d_pde3d.F90 index 1f6118ca..a1ceea84 100644 --- a/samples/advanced/pdegen/amg_d_pde3d.f90 +++ b/samples/advanced/pdegen/amg_d_pde3d.F90 @@ -74,6 +74,9 @@ program amg_d_pde3d use amg_d_pde3d_exp_mod use amg_d_pde3d_gauss_mod use amg_d_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -94,7 +97,7 @@ program amg_d_pde3d type(psb_d_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -198,6 +201,15 @@ program amg_d_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! This should not happen, but just in case @@ -456,6 +468,8 @@ program amg_d_pde3d call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) diff --git a/samples/advanced/pdegen/amg_s_pde2d.f90 b/samples/advanced/pdegen/amg_s_pde2d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_s_pde2d.f90 rename to samples/advanced/pdegen/amg_s_pde2d.F90 index a81d16ff..eb8a8d63 100644 --- a/samples/advanced/pdegen/amg_s_pde2d.f90 +++ b/samples/advanced/pdegen/amg_s_pde2d.F90 @@ -73,6 +73,9 @@ program amg_s_pde2d use amg_s_pde2d_exp_mod use amg_s_pde2d_box_mod use amg_s_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -93,7 +96,7 @@ program amg_s_pde2d type(psb_s_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -197,6 +200,15 @@ program amg_s_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
This should not happen, but just in case @@ -451,7 +463,9 @@ program amg_s_pde2d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) diff --git a/samples/advanced/pdegen/amg_s_pde3d.f90 b/samples/advanced/pdegen/amg_s_pde3d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_s_pde3d.f90 rename to samples/advanced/pdegen/amg_s_pde3d.F90 index 7542c3a2..da5cd173 100644 --- a/samples/advanced/pdegen/amg_s_pde3d.f90 +++ b/samples/advanced/pdegen/amg_s_pde3d.F90 @@ -74,6 +74,9 @@ program amg_s_pde3d use amg_s_pde3d_exp_mod use amg_s_pde3d_gauss_mod use amg_s_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -94,7 +97,7 @@ program amg_s_pde3d type(psb_s_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -198,6 +201,15 @@ program amg_s_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! This should not happen, but just in case @@ -456,6 +468,8 @@ program amg_s_pde3d call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) From 2fd718be6fc195157e49133028c7dc68fed11888 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 19 Jul 2023 15:59:11 +0200 Subject: [PATCH 87/96] Updates and measurements for OpenMP build --- amgprec/impl/aggregator/amg_c_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_c_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/aggregator/amg_d_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_d_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/aggregator/amg_s_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_s_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/aggregator/amg_z_ptap_bld.f90 | 34 +-- ...oc1_map_bld.f90 => amg_z_soc1_map_bld.F90} | 218 ++++++++++++++++-- amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_c_gs_solver_bld.f90 | 11 +- amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_d_gs_solver_bld.f90 | 11 +- amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_s_gs_solver_bld.f90 | 11 +- amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 | 7 + amgprec/impl/solver/amg_z_gs_solver_bld.f90 | 11 +- samples/advanced/pdegen/amg_d_pde3d.F90 | 6 +- samples/advanced/pdegen/amg_s_pde3d.F90 | 6 +- samples/advanced/pdegen/runs/amg_pde3d.inp | 8 +- 19 files changed, 954 insertions(+), 146 deletions(-) rename 
amgprec/impl/aggregator/{amg_c_soc1_map_bld.f90 => amg_c_soc1_map_bld.F90} (59%) rename amgprec/impl/aggregator/{amg_d_soc1_map_bld.f90 => amg_d_soc1_map_bld.F90} (59%) rename amgprec/impl/aggregator/{amg_s_soc1_map_bld.f90 => amg_s_soc1_map_bld.F90} (59%) rename amgprec/impl/aggregator/{amg_z_soc1_map_bld.f90 => amg_z_soc1_map_bld.F90} (59%) diff --git a/amgprec/impl/aggregator/amg_c_ptap_bld.f90 b/amgprec/impl/aggregator/amg_c_ptap_bld.f90 index d787d7a7..02dcb1f4 100644 --- a/amgprec/impl/aggregator/amg_c_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_c_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
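! In the hunk below, the element-by-element compaction loop that kept only
! the local rows is replaced by a single backward search: after
! coo_restr%fix(info) the COO entries are assumed to be ordered by row
! index, so the non-local entries (ia(k) > nrl) form a trailing block and
! it suffices to locate the last entry with ia(k) <= nrl and truncate the
! nonzero count there with set_nzeros.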
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_c_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index d1c734fc..4ec81322 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_c_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
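! The aggregation update below follows a check / lock / re-check pattern:
! "disjoint" is first evaluated outside the critical section as a cheap
! filter, then evaluated again inside !$omp critical(update_ilaggr), since
! another thread may have claimed one of the strongly coupled neighbours in
! the meantime; only the second test, performed under the lock, decides
! whether the new aggregate is created and ilaggr is updated.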
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
@@ -274,7 +452,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_d_ptap_bld.f90 b/amgprec/impl/aggregator/amg_d_ptap_bld.f90 index 8520e58e..4006c04c 100644 --- a/amgprec/impl/aggregator/amg_d_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index e3ae5245..af0a7764 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_d_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
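In the block above, the disjointness test on the candidate aggregate is evaluated twice on purpose: once outside !$omp critical(update_ilaggr) as a cheap filter, and again inside the critical section, because another thread may have claimed one of the neighbours between the two tests. If the re-test fails, the row is simply left unaggregated and is picked up by phase two or phase three. A minimal sketch of this double-checked update follows; the names (try_claim, owner, nbr, unclaimed) are invented for the example and this is not the library routine.

    subroutine try_claim(owner, nbr, ip, i, cnt, unclaimed)
      implicit none
      integer, intent(inout) :: owner(:)   ! owner(j) = aggregate of node j, or unclaimed
      integer, intent(inout) :: cnt        ! aggregate counter (per thread, as locnaggr above)
      integer, intent(in)    :: nbr(:), ip, i, unclaimed
      if (all(owner(nbr(1:ip)) == unclaimed)) then     ! cheap test, no lock taken
        !$omp critical(claim_nbrs)
        if (all(owner(nbr(1:ip)) == unclaimed)) then   ! re-test: another thread may have won
          cnt = cnt + 1                                ! note: all() on an empty section is true,
          owner(nbr(1:ip)) = cnt                       ! so ip==0 still claims node i alone
          owner(i) = cnt
        end if
        !$omp end critical(claim_nbrs)
      end if
    end subroutine try_claim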
@@ -274,7 +452,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_s_ptap_bld.f90 b/amgprec/impl/aggregator/amg_s_ptap_bld.f90 index 93b79b63..e1a6c867 100644 --- a/amgprec/impl/aggregator/amg_s_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 3f4cc437..967ee669 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_s_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
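The amg_?_ptap_bld hunks above (the same change is applied to the z variant below) replace the element-by-element compaction of coo_restr with a truncation: once call coo_restr%fix(info) has put the entries in row order, the non-local rows (ia(k) > nrl) form a contiguous tail, so it is enough to search backwards for the last local entry and shrink the nonzero count, leaving the retained entries untouched. A standalone sketch of that truncation, with invented names and assuming the row indices are already sorted:

    subroutine keep_local_rows(ia, nnz, nrl)
      implicit none
      integer, intent(in)    :: ia(:)   ! row indices, nondecreasing (as after coo%fix())
      integer, intent(inout) :: nnz     ! nonzero count, shrunk on exit
      integer, intent(in)    :: nrl     ! number of local rows
      integer :: k
      do k = nnz, 1, -1
        if (ia(k) <= nrl) exit          ! last entry that belongs to a local row
      end do
      nnz = k                           ! k == 0 when no local entries remain
    end subroutine keep_local_rows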
@@ -274,7 +452,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_z_ptap_bld.f90 b/amgprec/impl/aggregator/amg_z_ptap_bld.f90 index 6faf1b71..e322a303 100644 --- a/amgprec/impl/aggregator/amg_z_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. - integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! 
- do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 similarity index 59% rename from amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index d9f24130..611590cb 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_z_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,194 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! 
naggr = 0 - icnt = 0 +#if 0&&defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + nths = omp_get_num_threads() + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 +!!$ write(0,*) 'From thread : rsz ',myth,rsz + !$omp master + allocate(bnds(0:nths),locnaggr(0:nths)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + bnds(myth+1) = rsz + !$omp master +!!$ write(0,*) 'From master 1: ',bnds + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do +!!$ write(0,*) 'From master 2: ',bnds + !$omp end master + !$omp barrier + + !$omp do schedule(static) + do kk=0, nths-1 +!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + if (info /= 0) cycle + i = idxs(ii) + if ((i<1).or.(i>nr)) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + info=psb_err_internal_error_ + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + if (.false.) then + ip = 0 + do k=1, nz + j = icol(k) + if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + else + ip = 0 + minip = nr +1 + do k=1, nz + j = icol(k) + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + minip = min(icol(ip),minip) + end if + enddo + if (bnds(myth)<=minip) then + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + do k=1, ip + ilaggr(icol(k)) = locnaggr(kk) + end do + ilaggr(i) = locnaggr(kk) + end if + !$omp end critical(update_ilaggr) + end if + endif + end if + end if + enddo step1 + end do + !$omp end do + !$omp barrier + !$omp master + naggr = sum(locnaggr(0:nths-1)) +!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + end do + end do + !$omp end do + end block + !$omp end parallel + end block +!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +342,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +360,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +369,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +!!$ write(0,*) 'NAGGR ',naggr +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + ! $ omp workshare tmpaggr = ilaggr + ! $ omp end workshare + ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +415,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + ! $ omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if + + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
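The timing changes scattered through these patches all follow the same PSBLAS idiom, which is also applied below to the a%tril call in the gs/bwgs solver builds and reported at the end of the run by the amg_?_pde3d drivers: register a named timer once and keep its index in a saved variable, bracket the region of interest with psb_tic/psb_toc, and print the accumulated times with psb_print_timers. A condensed sketch of the pattern (the label string is made up; the psb_* calls are the ones used in the hunks above and below, inside a routine that already uses psb_base_mod):

      integer(psb_ipk_), save :: idx_mystep = -1
      logical, parameter      :: do_timings = .true.

      if ((do_timings).and.(idx_mystep == -1)) &
           & idx_mystep = psb_get_timer_idx("MYSTEP: phase of interest")
      if (do_timings) call psb_tic(idx_mystep)
      ! ... code being timed ...
      if (do_timings) call psb_toc(idx_mystep)

      ! once, at the end of the program (see the amg_?_pde3d.F90 hunks above):
      call psb_print_timers(ctxt)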
@@ -274,7 +452,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -309,7 +486,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in endif end if end do - + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ @@ -336,9 +513,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if +!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 index f760c80f..11ea6576 100644 --- a/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_c_gs_solver_bld.f90 b/amgprec/impl/solver/amg_c_gs_solver_bld.f90 index 3cdfe7e7..79be20b5 100644 --- a/amgprec/impl/solver/amg_c_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_c_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='c_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 index 859c8ebe..de5f91f8 100644 --- a/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_d_gs_solver_bld.f90 b/amgprec/impl/solver/amg_d_gs_solver_bld.f90 index 3cbc78ee..918712b5 100644 --- a/amgprec/impl/solver/amg_d_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_d_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 index e96e1229..d285d0b3 100644 --- a/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_s_gs_solver_bld.f90 b/amgprec/impl/solver/amg_s_gs_solver_bld.f90 index b4580f0f..6e0870b7 100644 --- a/amgprec/impl/solver/amg_s_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_s_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='s_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 index dec629f5..a953c608 100644 --- a/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_z_gs_solver_bld.f90 b/amgprec/impl/solver/amg_z_gs_solver_bld.f90 index b347937a..748a6122 100644 --- a/amgprec/impl/solver/amg_z_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_z_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='z_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! + !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/samples/advanced/pdegen/amg_d_pde3d.F90 b/samples/advanced/pdegen/amg_d_pde3d.F90 index a1ceea84..cb9542d4 100644 --- a/samples/advanced/pdegen/amg_d_pde3d.F90 +++ b/samples/advanced/pdegen/amg_d_pde3d.F90 @@ -195,7 +195,7 @@ program amg_d_pde3d ! 
other variables integer(psb_ipk_) :: info, i, k character(len=20) :: name,ch_err - + type(psb_d_csr_sparse_mat) :: amold info=psb_success_ @@ -402,7 +402,7 @@ program amg_d_pde3d end if call psb_barrier(ctxt) t1 = psb_wtime() - call prec%smoothers_build(a,desc_a,info) + call prec%smoothers_build(a,desc_a,info,amold=amold) tprec = psb_wtime()-t1 if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') @@ -492,7 +492,7 @@ program amg_d_pde3d write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() end if - + call psb_print_timers(ctxt) ! ! cleanup storage and exit ! diff --git a/samples/advanced/pdegen/amg_s_pde3d.F90 b/samples/advanced/pdegen/amg_s_pde3d.F90 index da5cd173..d6195c45 100644 --- a/samples/advanced/pdegen/amg_s_pde3d.F90 +++ b/samples/advanced/pdegen/amg_s_pde3d.F90 @@ -195,7 +195,7 @@ program amg_s_pde3d ! other variables integer(psb_ipk_) :: info, i, k character(len=20) :: name,ch_err - + type(psb_s_csr_sparse_mat) :: amold info=psb_success_ @@ -402,7 +402,7 @@ program amg_s_pde3d end if call psb_barrier(ctxt) t1 = psb_wtime() - call prec%smoothers_build(a,desc_a,info) + call prec%smoothers_build(a,desc_a,info,amold=amold) tprec = psb_wtime()-t1 if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') @@ -492,7 +492,7 @@ program amg_s_pde3d write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() end if - + call psb_print_timers(ctxt) ! ! cleanup storage and exit ! diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index 0cd5d6c5..7a3329cd 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0200 ! IDIM; domain size. Linear system size is IDIM**3 +0200 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC @@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F 30 ! IRST (restart for RGMRES and BiCGSTABL) 1.d-6 ! EPS %%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%% -ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) +ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML %%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%% FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous. @@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL -3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default -3 ! Target coarse matrix size per process; if <0, lib default SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED -COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED -MATCHBOXP ! aggregation measure SOC1, MATCHBOXP +DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED +SOC1 ! aggregation measure SOC1, MATCHBOXP 8 ! Requested size of the aggregates for MATCHBOXP NATURAL ! Ordering of aggregation NATURAL DEGREE -1.5 ! Coarsening ratio, if < 0 use library default From 7b9c722a1a8801109358358089aa1924374ef6bd Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 26 Jul 2023 14:32:13 +0200 Subject: [PATCH 88/96] Fixed OpenMP version of SOC1. 
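In outline, the changes below enable the OpenMP path of phase one, previously left under #if 0&&defined(OPENMP), and rework it so that it produces a consistent aggregation. The ils(:) work array is replaced by ioffs(:), which records for every row the thread block that assigned its provisional aggregate. Provisional aggregate numbers become bnds(kk)-1+locnaggr(kk), hence unique across threads; rows that have a neighbour already assigned to an aggregate are skipped (cycle step1); the disjointness test is re-checked inside !$omp critical(update_ilaggr) before an aggregate is committed; and a barrier is added before the prefix sum of bnds. The final pass then maps provisional numbers to a contiguous global numbering using ioffs together with the exclusive prefix sum of locnaggr. Phase two now runs under the previously commented-out !$omp workshare and !$omp parallel do directives, and leftover debug writes are removed.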
--- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 129 +++++++----------- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 129 +++++++----------- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 129 +++++++----------- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 129 +++++++----------- 4 files changed, 200 insertions(+), 316 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 4ec81322..91acfefe 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) 
then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! 
$ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index af0a7764..0c76f269 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! 
naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! 
- disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 967ee669..fe5701ef 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! 
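+          ! The DISJOINT test is repeated inside the critical section:
+          ! between the first (unlocked) check and entry into the critical
+          ! region another thread may have claimed one of the neighbours,
+          ! so the aggregate is only committed if it is still disjoint.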
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! 
Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 611590cb..54c3add4 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -154,7 +154,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp parallel do private(i) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i + ioffs(i) = 0 end do !$omp end parallel do else @@ -162,6 +163,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) + ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -172,37 +174,35 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! 
naggr = 0 -#if 0&&defined(OPENMP) +#if defined(OPENMP) block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) private(icol,val,myth,kk) + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + !$omp private(icol,val,myth,kk) block - integer(psb_ipk_) :: ii,nlp,k,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz, minip + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths if (myth < mod(nr,nths)) rsz = rsz + 1 -!!$ write(0,*) 'From thread : rsz ',myth,rsz !$omp master - allocate(bnds(0:nths),locnaggr(0:nths)) + allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier bnds(myth+1) = rsz + !$omp barrier !$omp master -!!$ write(0,*) 'From master 1: ',bnds do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do -!!$ write(0,*) 'From master 2: ',bnds !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) do kk=0, nths-1 -!!$ write(0,*) 'From thread ',myth,kk,bnds(kk),bnds(kk+1)-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle i = idxs(ii) @@ -228,94 +228,67 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - if (.false.) then - ip = 0 - do k=1, nz - j = icol(k) - if ((bnds(myth)<=j).and.(j<=(bnds(myth+1)-1))) then - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - end if - end if - enddo - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! + ip = 0 + do k=1, nz + j = icol(k) + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! + disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + !$omp critical(update_ilaggr) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) + ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) + ioffs(icol(k)) = kk end do - ilaggr(i) = locnaggr(kk) + ilaggr(i) = bnds(kk)-1+locnaggr(kk) + ioffs(i) = kk end if - else - ip = 0 - minip = nr +1 - do k=1, nz - j = icol(k) - if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then - ip = ip + 1 - icol(ip) = icol(k) - minip = min(icol(ip),minip) - end if - enddo - if (bnds(myth)<=minip) then - - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! Same if ip==0 (in which case, neighborhood only - ! contains I even if it does not look like it from matrix) - ! 
- disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = locnaggr(kk) - end do - ilaggr(i) = locnaggr(kk) - end if - !$omp end critical(update_ilaggr) - end if - endif + !$omp end critical(update_ilaggr) end if + end if enddo step1 end do !$omp end do - !$omp barrier + !$omp master naggr = sum(locnaggr(0:nths-1)) -!!$ write(0,*) 'NAGGR ',naggr, 'locnaggr ',locnaggr(0:nths-1) do i=1,nths locnaggr(i) = locnaggr(i) + locnaggr(i-1) end do - do i=nths,1,-1 + do i=nths+1,1,-1 locnaggr(i) = locnaggr(i-1) end do locnaggr(0) = 0 - !$omp end master + !$omp end master !$omp barrier !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) ilaggr(ii) = ilaggr(ii) + locnaggr(kk) + if (ilaggr(ii) > 0) then + kp = ioffs(ii) + ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + end if end do end do !$omp end do end block !$omp end parallel end block -!!$ write(0,*) 'Out of parallel looop NAGGR ',naggr #else step1: do ii=1, nr if (info /= 0) cycle @@ -369,7 +342,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 -!!$ write(0,*) 'NAGGR ',naggr #endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& @@ -381,10 +353,11 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Phase two: join the neighbours ! - ! $ omp workshare + !$omp workshare tmpaggr = ilaggr - ! $ omp end workshare - ! $ omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta) private(ii,i,j,k,nz,icol,val,ip) + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip) step2: do ii=1,nr i = idxs(ii) @@ -415,14 +388,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - ! $ omp end parallel do + !$omp end parallel do if (do_timings) call psb_toc(idx_soc1_p2) if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! 
Phase three: sweep over leftovers, if any @@ -488,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -518,7 +490,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if -!!$ write(0,*) nlaggr(1:np),'ILAGGR : ',ilaggr(1:nr) call acsr%free() call psb_erractionrestore(err_act) From e3de565b6dc31f6cd566547ee8622544d86593a8 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 26 Jul 2023 14:47:05 +0200 Subject: [PATCH 89/96] Updated commeents in SOC1 --- amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 | 14 ++++++++++++++ amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 | 14 ++++++++++++++ amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 | 14 ++++++++++++++ amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 | 14 ++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 91acfefe..eb6b0eac 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 0c76f269..241f0568 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! 
if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index fe5701ef..329cd3ba 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 54c3add4..697a55b3 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -178,6 +178,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in block integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. + + !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block @@ -231,6 +243,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ip = 0 do k=1, nz j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. 
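+            ! Skipping the whole row keeps the new aggregate disjoint from
+            ! existing ones: vertex I stays unassigned here and is picked up
+            ! again in the later phases (join the neighbours / leftovers).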
if (ilaggr(j) > 0) cycle step1 if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then ip = ip + 1 From e78449d0f5c6bed62b23ab5b9172de9b4cb16289 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 31 Jul 2023 13:26:05 +0200 Subject: [PATCH 90/96] Prepare for SOC2 OpenMP --- .../aggregator/{amg_c_soc2_map_bld.f90 => amg_c_soc2_map_bld.F90} | 0 .../aggregator/{amg_d_soc2_map_bld.f90 => amg_d_soc2_map_bld.F90} | 0 .../aggregator/{amg_s_soc2_map_bld.f90 => amg_s_soc2_map_bld.F90} | 0 .../aggregator/{amg_z_soc2_map_bld.f90 => amg_z_soc2_map_bld.F90} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename amgprec/impl/aggregator/{amg_c_soc2_map_bld.f90 => amg_c_soc2_map_bld.F90} (100%) rename amgprec/impl/aggregator/{amg_d_soc2_map_bld.f90 => amg_d_soc2_map_bld.F90} (100%) rename amgprec/impl/aggregator/{amg_s_soc2_map_bld.f90 => amg_s_soc2_map_bld.F90} (100%) rename amgprec/impl/aggregator/{amg_z_soc2_map_bld.f90 => amg_z_soc2_map_bld.F90} (100%) diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 similarity index 100% rename from amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 From c1ecb4ebec85413cff2ff728a0c9e611eb77554f Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 3 Aug 2023 13:26:24 +0200 Subject: [PATCH 91/96] Fixed SOC1 and begin work on SOC2 --- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_c_soc2_map_bld.F90 | 84 ++++++++++++++++--- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_d_soc2_map_bld.F90 | 84 ++++++++++++++++--- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_s_soc2_map_bld.F90 | 84 ++++++++++++++++--- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 63 ++++++++------ .../impl/aggregator/amg_z_soc2_map_bld.F90 | 84 ++++++++++++++++--- 8 files changed, 448 insertions(+), 140 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index eb6b0eac..516daf4b 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index 020cae4b..ed4161a5 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_c_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 241f0568..f2cf9027 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 1433a670..6047f375 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_d_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 329cd3ba..4d9ab106 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_spk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index 4bb17a80..e94261a8 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_s_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 697a55b3..40a85dae 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ioffs(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) complex(psb_dpk_), allocatable :: val(:), diag(:) @@ -130,7 +130,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nr = a%get_nrows() nc = a%get_ncols() - allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),& + allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),& & icol(nc),val(nc),stat=info) if(info /= psb_success_) then info=psb_err_alloc_request_ @@ -151,19 +151,17 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i - ioffs(i) = 0 end do !$omp end parallel do else - !$omp parallel do private(i) + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) - ioffs(i) = 0 end do !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) @@ -189,11 +187,12 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - - !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & + info = 0 + !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp nths = omp_get_num_threads() myth = omp_get_thread_num() rsz = nr/nths @@ -213,7 +212,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp end master !$omp barrier - !$omp do schedule(static) + !$omp do schedule(static) private(disjoint) reduction(max: info) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 if (info /= 0) cycle @@ -257,23 +256,31 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! as yet unconnected, turn it into the next aggregate. ! Same if ip==0 (in which case, neighborhood only ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index c1b165b1..e09bcf1e 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_z_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) From 9e82d2e3118c41b174dfee14ce9002fa7934b7a8 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 4 Aug 2023 09:30:32 +0200 Subject: [PATCH 92/96] Final OMP version of SOC1. 
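
Each thread KK counts the aggregates it creates in LOCNAGGR(KK) and writes a
provisional index into ILAGGR, ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS+KK, whose
remainder mod NTHS is the owning thread and whose quotient is the thread's
block start plus its local aggregate counter; the test
ITMP < BNDS(KK)-1+LOCNAGGR(KK) guards against overflow of the multiplication
by NTHS. After the loop an exclusive prefix sum over LOCNAGGR turns the
provisional indices into the global numbering, decoded as
KP = MOD(ILAGGR(II),NTHS) and ILAGGR(II) = ILAGGR(II)/NTHS - (BNDS(KP)-1) + LOCNAGGR(KP).
The fragment below is an editorial sketch of that encode/decode round trip,
not part of the patch; the thread count, block bounds and aggregate counts in
it are made-up examples.

    program encode_decode_sketch
      implicit none
      integer, parameter :: nths = 4
      integer :: bnds(0:nths), locnaggr(0:nths+1)
      integer :: kk, kp, itmp, iagg
      bnds = (/ 1, 26, 51, 76, 101 /)        ! 100 rows split over 4 threads
      locnaggr = 0
      ! Thread kk creates its 3rd block-local aggregate: encode the thread id
      ! together with the provisional index in a single integer.
      kk = 2
      locnaggr(kk) = 3
      itmp = (bnds(kk)-1+locnaggr(kk))*nths + kk     ! as in the patch
      ! After the loop: exclusive prefix sum, so that locnaggr(kp) holds the
      ! number of aggregates created by threads 0..kp-1 (here 10+2=12).
      locnaggr(0:nths+1) = (/ 0, 10, 12, 0, 0, 0 /)
      ! Decode: recover the owning thread, then renumber globally.
      kp   = mod(itmp, nths)
      iagg = itmp/nths - (bnds(kp)-1) + locnaggr(kp)
      write(*,*) 'thread', kp, 'global aggregate', iagg   ! prints 2 and 15
    end program encode_decode_sketch
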
--- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 37 ++++++++++++------- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 37 ++++++++++++------- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 37 ++++++++++++------- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 37 ++++++++++++------- 4 files changed, 92 insertions(+), 56 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 516daf4b..70931f05 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index f2cf9027..bd40a2a4 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 4d9ab106..109abc00 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 40a85dae..3efee9e8 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -187,47 +187,51 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! been set because it is strongly connected to an entry J belonging to a ! different thread. - info = 0 - !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) & - !$omp private(icol,val,myth,kk) + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) block integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz integer(psb_lpk_) :: itmp - nths = omp_get_num_threads() - myth = omp_get_thread_num() - rsz = nr/nths - if (myth < mod(nr,nths)) rsz = rsz + 1 !$omp master + nths = omp_get_num_threads() allocate(bnds(0:nths),locnaggr(0:nths+1)) locnaggr(:) = 0 bnds(0) = 1 !$omp end master !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 bnds(myth+1) = rsz !$omp barrier !$omp master do i=1,nths bnds(i) = bnds(i) + bnds(i-1) end do + info = 0 !$omp end master !$omp barrier - - !$omp do schedule(static) private(disjoint) reduction(max: info) + + !$omp do schedule(static) private(disjoint) do kk=0, nths-1 step1: do ii=bnds(kk), bnds(kk+1)-1 - if (info /= 0) cycle - i = idxs(ii) + i = idxs(ii) + if (info /= 0) cycle step1 if ((i<1).or.(i>nr)) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name) cycle step1 !goto 9999 @@ -250,7 +254,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in icol(ip) = icol(k) end if enddo - + ! ! If the whole strongly coupled neighborhood of I is ! as yet unconnected, turn it into the next aggregate. @@ -263,13 +267,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! However, a certain unrepeatability is already present ! because the sequence of aggregates is computed with a ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. ! 
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) if (disjoint) then locnaggr(kk) = locnaggr(kk) + 1 itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk if (itmp < (bnds(kk)-1+locnaggr(kk))) then - info = 12345678 + !$omp atomic update + info = max(12345678,info) + !$omp end atomic cycle step1 end if !$omp atomic write From 73495edf09f6d3f73749e1e6e15d03f7fafd3ee9 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 7 Aug 2023 08:59:32 +0200 Subject: [PATCH 93/96] Finish SOC1 OpenMP --- amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 | 7 ++++++- amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 | 7 ++++++- amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 | 7 ++++++- amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 | 7 ++++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index 70931f05..b9110aae 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index bd40a2a4..2b01f3e5 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 109abc00..069c924e 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. 
ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 3efee9e8..d618fe1c 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -478,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -489,9 +492,11 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do From 5bcd36f39411dcd3201072cf48d7ebe2eb3bbf88 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 8 Aug 2023 09:25:15 +0200 Subject: [PATCH 94/96] Fixed SOC1 and SOC2 OpenMP --- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_c_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_d_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_s_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_z_soc2_map_bld.F90 | 189 ++++++++++++++---- 8 files changed, 608 insertions(+), 156 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index b9110aae..81047953 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index ed4161a5..3bda8e90 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_c_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 2b01f3e5..c83dfe3b 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 6047f375..b4602378 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_d_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 069c924e..59a7c03b 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index e94261a8..8dac2dd5 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_s_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index d618fe1c..66c8e4e2 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index e09bcf1e..19956309 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_z_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! 
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! - if (nz ==0) then - ilaggr(i) = 0 - cycle step1 - end if - if ((nz==1).and.(s_neigh%ja(ip1)==i)) then - ilaggr(i) = 0 - cycle step1 - end if - ! - ! If the whole strongly coupled neighborhood of I is - ! as yet unconnected, turn it into the next aggregate. - ! - nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) - icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) - disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) - if (disjoint) then - icnt = icnt + 1 - naggr = naggr + 1 - do k=1, nzcnt - ilaggr(icol(k)) = naggr + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! 
+ if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 From d33bcfe107508a5d71de3eb56f9b7d8c66fdcb0f Mon Sep 17 00:00:00 2001 From: sfilippone Date: Sun, 13 Aug 2023 09:52:25 +0200 Subject: [PATCH 95/96] Completed SOC2 OpenMP. 
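
Phase two (join the neighbours) and phase three (sweep over leftovers) of the
SOC2 map build now run under OpenMP, with the temporary copy of ILAGGR taken
inside an OMP WORKSHARE and psb_tic/psb_toc pairs delimiting phases 1-3. A
GOTO out of a parallel DO is not allowed, so the serial "goto 9999" taken on
non-singleton leftovers is replaced by an atomic write to INFO followed by
CYCLE, and the error branch is taken once, after the loop. Below is a minimal
editorial sketch of that error-handling pattern, not part of the patch; the
subroutine and the failure condition are illustrative, only the control flow
mirrors the change.

    subroutine leftover_sweep_sketch(nr, ilaggr, info)
      implicit none
      integer, intent(in)    :: nr
      integer, intent(inout) :: ilaggr(nr)
      integer, intent(out)   :: info
      integer :: i
      info = 0
      !$omp parallel do schedule(static) shared(ilaggr,info) private(i)
      do i = 1, nr
        if (info /= 0) cycle          ! a previous iteration already failed
        if (ilaggr(i) < 0) then       ! stands in for the "leftover" test
          !$omp atomic write
          info = -1                   ! stands in for psb_err_internal_error_
          !$omp end atomic
          cycle                       ! cannot GOTO out of the parallel loop
        end if
      end do
      !$omp end parallel do
      if (info /= 0) return           ! the serial code would GOTO 9999 here
    end subroutine leftover_sweep_sketch
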
---
 .../impl/aggregator/amg_c_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_c_soc2_map_bld.F90 | 22 ++++++++++++++-----
 .../impl/aggregator/amg_d_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_d_soc2_map_bld.F90 | 22 ++++++++++++++-----
 .../impl/aggregator/amg_s_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_s_soc2_map_bld.F90 | 22 ++++++++++++++-----
 .../impl/aggregator/amg_z_soc1_map_bld.F90 |  2 ++
 .../impl/aggregator/amg_z_soc2_map_bld.F90 | 22 ++++++++++++++-----
 8 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
index 81047953..4041ebe5 100644
--- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90
index 3bda8e90..b250e434 100644
--- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90
index c83dfe3b..de95abce 100644
--- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90
index b4602378..345cd1ad 100644
--- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90
index 59a7c03b..0a809624 100644
--- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90
index 8dac2dd5..ef7f5707 100644
--- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90
index 66c8e4e2..2c467426 100644
--- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90
@@ -500,6 +500,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          endif
        end if
      end do
+     !$omp end parallel do
+     if (info /= 0) goto 9999
      if (do_timings) call psb_toc(idx_soc1_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90
index 19956309..c6ac226e 100644
--- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90
@@ -406,11 +406,16 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
        write(debug_unit,*) me,' ',trim(name),&
             & ' Check 1:',count(ilaggr == -(nr+1))
      end if
 
-
+     if (do_timings) call psb_toc(idx_soc2_p1)
+     if (do_timings) call psb_tic(idx_soc2_p2)
      !
      ! Phase two: join the neighbours
      !
+     !$omp workshare
      tmpaggr = ilaggr
+     !$omp end workshare
+     !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
+     !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
      step2: do ii=1,nr
        i = idxs(ii)
@@ -436,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          end if
        end if
      end do step2
-
-
+     !$omp end parallel do
+     if (do_timings) call psb_toc(idx_soc2_p2)
+     if (do_timings) call psb_tic(idx_soc2_p3)
      !
      ! Phase three: sweep over leftovers, if any
      !
@@ -471,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      end do step3
 
      ! Any leftovers?
+     !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
+     !$omp private(ii,i,j,k)
      do i=1, nr
        if (ilaggr(i) <= 0) then
          nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@@ -482,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
            ! other processes.
            ilaggr(i) = -(nrglob+nr)
          else
+            !$omp atomic write
            info=psb_err_internal_error_
+            !$omp end atomic
            call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
-            goto 9999
+            cycle
          endif
        end if
      end do
-
+     !$omp end parallel do
+     if (info /= 0) goto 9999
+     if (do_timings) call psb_toc(idx_soc2_p3)
      if (naggr > ncol) then
        info=psb_err_internal_error_
        call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

From 11421f53a256ad0b33503c5500d24aa5773f5934 Mon Sep 17 00:00:00 2001
From: sfilippone
Date: Tue, 22 Aug 2023 10:36:44 +0200
Subject: [PATCH 96/96] Minor updates on sample output

---
 samples/advanced/pdegen/amg_d_pde2d.F90 | 10 +++++-----
 samples/advanced/pdegen/amg_d_pde3d.F90 |  2 +-
 samples/advanced/pdegen/amg_s_pde2d.F90 | 10 +++++-----
 samples/advanced/pdegen/amg_s_pde3d.F90 |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/samples/advanced/pdegen/amg_d_pde2d.F90 b/samples/advanced/pdegen/amg_d_pde2d.F90
index 37e9fcd6..145c5890 100644
--- a/samples/advanced/pdegen/amg_d_pde2d.F90
+++ b/samples/advanced/pdegen/amg_d_pde2d.F90
@@ -463,14 +463,14 @@ program amg_d_pde2d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
-    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
-    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
-    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
-    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
+    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
+    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
+    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
+    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
     write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
     write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
     write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier
diff --git a/samples/advanced/pdegen/amg_d_pde3d.F90 b/samples/advanced/pdegen/amg_d_pde3d.F90
index cb9542d4..75dcd1a0 100644
--- a/samples/advanced/pdegen/amg_d_pde3d.F90
+++ b/samples/advanced/pdegen/amg_d_pde3d.F90
@@ -467,7 +467,7 @@ program amg_d_pde3d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
diff --git a/samples/advanced/pdegen/amg_s_pde2d.F90 b/samples/advanced/pdegen/amg_s_pde2d.F90
index eb8a8d63..ad28d1f6 100644
--- a/samples/advanced/pdegen/amg_s_pde2d.F90
+++ b/samples/advanced/pdegen/amg_s_pde2d.F90
@@ -463,14 +463,14 @@ program amg_s_pde2d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
-    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
-    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
-    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
-    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
+    write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
+    write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
+    write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
+    write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
     write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
     write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
     write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier
diff --git a/samples/advanced/pdegen/amg_s_pde3d.F90 b/samples/advanced/pdegen/amg_s_pde3d.F90
index d6195c45..cda6a48b 100644
--- a/samples/advanced/pdegen/amg_s_pde3d.F90
+++ b/samples/advanced/pdegen/amg_s_pde3d.F90
@@ -467,7 +467,7 @@ program amg_s_pde3d
   call psb_sum(ctxt,precsize)
   call prec%descr(info,iout=psb_out_unit)
   if (iam == psb_root_) then
-    write(psb_out_unit,'("Computed solution on ",i8," processors")') np
+    write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
     write(psb_out_unit,'("Number of threads : ",i12)') nth
     write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
     write(psb_out_unit,'("Linear system size : ",i12)') system_size
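
The leftover sweeps parallelised in the soc2 patches above all rely on the same OpenMP error-handling idiom: a goto cannot jump out of a parallel do, so the error code is stored atomically into a shared flag, the failing iteration is skipped with cycle, and the flag is tested once after the loop. The following is a minimal, self-contained sketch of that idiom only; the program and its data are invented for illustration and are not part of the library.

! Sketch of the shared error-flag pattern used in the parallel leftover loops.
program leftover_flag_sketch
  implicit none
  integer, parameter :: nr = 1000
  integer :: ilaggr(nr)
  integer :: i, info

  ilaggr = 1          ! pretend every vertex got an aggregate...
  ilaggr(17) = 0      ! ...except one leftover
  info = 0

  !$omp parallel do schedule(static) shared(ilaggr,info) private(i)
  do i = 1, nr
    if (ilaggr(i) <= 0) then
      ! No goto out of a parallel do: record the error atomically,
      ! skip this iteration, and test info once the loop has finished.
      !$omp atomic write
      info = -1
      !$omp end atomic
      cycle
    end if
  end do
  !$omp end parallel do

  if (info /= 0) then
    print *, 'leftover detected, info =', info
  else
    print *, 'no leftovers'
  end if
end program leftover_flag_sketch

Built with OpenMP enabled (e.g. gfortran -fopenmp) the loop runs across threads; without it the directives are treated as comments and the serial semantics are unchanged.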