diff --git a/README.md b/README.md index 2a4426e3..3be67f1f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - AMG4PSBLAS Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.8) diff --git a/amgprec/amg_d_matchboxp_mod.f90 b/amgprec/amg_d_matchboxp_mod.f90 index a18d62d6..e19ce617 100644 --- a/amgprec/amg_d_matchboxp_mod.f90 +++ b/amgprec/amg_d_matchboxp_mod.f90 @@ -143,9 +143,10 @@ contains type(psb_ld_coo_sparse_mat) :: tmpcoo logical :: display_out_, print_out_, reproducible_ logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & - & debug_ilaggr=.false., debug_sync=.false. + & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. + integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 ictxt = desc_a%get_ctxt() call psb_info(ictxt,iam,np) @@ -187,7 +188,7 @@ contains call desc_a%l2gip(ilv,info,owned=.false.) call psb_geall(ilaggr,desc_a,info) - ilaggr = -1 + ilaggr = ilaggr_neginit call psb_geasb(ilaggr,desc_a,info) nr = a%get_nrows() nc = a%get_ncols() @@ -213,7 +214,20 @@ contains call psb_barrier(ictxt) if (iam == 0) write(0,*)' out from buildmatching:', info end if - + if (debug_mate) then + block + integer(psb_lpk_), allocatable :: ckmate(:) + allocate(ckmate(nr)) + ckmate(1:nr) = mate(1:nr) + call psb_msort(ckmate(1:nr)) + do i=1,nr-1 + if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then + write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i) + end if + end do + end block + end if + if (info == 0) then if (do_timings) call psb_tic(idx_phase2) if (debug_sync) then @@ -259,7 +273,7 @@ contains cycle else - if (ilaggr(k) == -1) then + if (ilaggr(k) == ilaggr_neginit) then wk = w(k) widx = w(idx) @@ -267,7 +281,7 @@ contains nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) if (nrmagg > epsilon(nrmagg)) then if (idx <= nr) then - if (ilaggr(idx) == -1) then + if (ilaggr(idx) == ilaggr_neginit) then ! Now, if both vertices are local, the aggregate is local ! (kinda obvious). nlaggr(iam) = nlaggr(iam) + 1 @@ -275,6 +289,9 @@ contains ilaggr(idx) = nlaggr(iam) wtemp(k) = w(k)/nrmagg wtemp(idx) = w(idx)/nrmagg + else + write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,& + &mate(idx),ilaggr(idx) end if nlpairs = nlpairs+1 else if (idx <= nc) then @@ -294,7 +311,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if else ! Use a statistically unbiased tie-breaking rule, @@ -309,7 +326,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if end if end if @@ -325,6 +342,12 @@ contains nlsingl = nlsingl + 1 end if end if + if (ilaggr(k) == ilaggr_neginit) then + write(0,*) iam,' Error: no update to ',k,mate(k),& + & abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k) + end if + else + if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k) end if end if end do @@ -332,7 +355,7 @@ contains if (do_timings) call psb_tic(idx_phase3) ! Ok, now compute offsets, gather halo and fix non-local - ! aggregates (those where ilaggr == -2) + ! aggregates (those where ilaggr == ilaggr_nonlocal) call psb_sum(ictxt,nlaggr) ntaggr = sum(nlaggr(0:np-1)) naggrm1 = sum(nlaggr(0:iam-1)) @@ -347,7 +370,7 @@ contains call psb_halo(wtemp,desc_a,info) ! 
Cleanup as yet unmarked entries do k=1,nr - if (ilaggr(k) == -2) then + if (ilaggr(k) == ilaggr_nonlocal) then idx = mate(k) if (idx > nr) then i = ilaggr(idx) @@ -359,9 +382,14 @@ contains else write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) end if - end if - if (ilaggr(k) <0) then - write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + else if (ilaggr(k) <0) then + write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + write(0,*) iam,' : : ',nr,nc,mate(k) + if (mate(k) <= nr) then + write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),& + & ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k))) + end if + flush(0) end if end do if (debug_sync) then @@ -414,7 +442,7 @@ contains end block if (iam == 0) then - write(0,*) 'Matching statistics: Unmatched nodes ',& + write(0,*) iam,'Matching statistics: Unmatched nodes ',& & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs end if diff --git a/amgprec/amg_s_matchboxp_mod.f90 b/amgprec/amg_s_matchboxp_mod.f90 index 9061344f..a7f41c24 100644 --- a/amgprec/amg_s_matchboxp_mod.f90 +++ b/amgprec/amg_s_matchboxp_mod.f90 @@ -143,9 +143,10 @@ contains type(psb_ls_coo_sparse_mat) :: tmpcoo logical :: display_out_, print_out_, reproducible_ logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & - & debug_ilaggr=.false., debug_sync=.false. + & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false. integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 logical, parameter :: do_timings=.true. + integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2 ictxt = desc_a%get_ctxt() call psb_info(ictxt,iam,np) @@ -187,7 +188,7 @@ contains call desc_a%l2gip(ilv,info,owned=.false.) call psb_geall(ilaggr,desc_a,info) - ilaggr = -1 + ilaggr = ilaggr_neginit call psb_geasb(ilaggr,desc_a,info) nr = a%get_nrows() nc = a%get_ncols() @@ -213,7 +214,20 @@ contains call psb_barrier(ictxt) if (iam == 0) write(0,*)' out from buildmatching:', info end if - + if (debug_mate) then + block + integer(psb_lpk_), allocatable :: ckmate(:) + allocate(ckmate(nr)) + ckmate(1:nr) = mate(1:nr) + call psb_msort(ckmate(1:nr)) + do i=1,nr-1 + if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then + write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i) + end if + end do + end block + end if + if (info == 0) then if (do_timings) call psb_tic(idx_phase2) if (debug_sync) then @@ -259,7 +273,7 @@ contains cycle else - if (ilaggr(k) == -1) then + if (ilaggr(k) == ilaggr_neginit) then wk = w(k) widx = w(idx) @@ -267,7 +281,7 @@ contains nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) if (nrmagg > epsilon(nrmagg)) then if (idx <= nr) then - if (ilaggr(idx) == -1) then + if (ilaggr(idx) == ilaggr_neginit) then ! Now, if both vertices are local, the aggregate is local ! (kinda obvious). nlaggr(iam) = nlaggr(iam) + 1 @@ -275,6 +289,9 @@ contains ilaggr(idx) = nlaggr(iam) wtemp(k) = w(k)/nrmagg wtemp(idx) = w(idx)/nrmagg + else + write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,& + &mate(idx),ilaggr(idx) end if nlpairs = nlpairs+1 else if (idx <= nc) then @@ -294,7 +311,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if else ! 
Use a statistically unbiased tie-breaking rule, @@ -309,7 +326,7 @@ contains ilaggr(k) = nlaggr(iam) nlpairs = nlpairs+1 else - ilaggr(k) = -2 + ilaggr(k) = ilaggr_nonlocal end if end if end if @@ -325,6 +342,12 @@ contains nlsingl = nlsingl + 1 end if end if + if (ilaggr(k) == ilaggr_neginit) then + write(0,*) iam,' Error: no update to ',k,mate(k),& + & abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k) + end if + else + if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k) end if end if end do @@ -332,7 +355,7 @@ contains if (do_timings) call psb_tic(idx_phase3) ! Ok, now compute offsets, gather halo and fix non-local - ! aggregates (those where ilaggr == -2) + ! aggregates (those where ilaggr == ilaggr_nonlocal) call psb_sum(ictxt,nlaggr) ntaggr = sum(nlaggr(0:np-1)) naggrm1 = sum(nlaggr(0:iam-1)) @@ -347,7 +370,7 @@ contains call psb_halo(wtemp,desc_a,info) ! Cleanup as yet unmarked entries do k=1,nr - if (ilaggr(k) == -2) then + if (ilaggr(k) == ilaggr_nonlocal) then idx = mate(k) if (idx > nr) then i = ilaggr(idx) @@ -359,9 +382,14 @@ contains else write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) end if - end if - if (ilaggr(k) <0) then - write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + else if (ilaggr(k) <0) then + write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) + write(0,*) iam,' : : ',nr,nc,mate(k) + if (mate(k) <= nr) then + write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),& + & ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k))) + end if + flush(0) end if end do if (debug_sync) then @@ -414,7 +442,7 @@ contains end block if (iam == 0) then - write(0,*) 'Matching statistics: Unmatched nodes ',& + write(0,*) iam,'Matching statistics: Unmatched nodes ',& & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs end if diff --git a/amgprec/impl/aggregator/Makefile b/amgprec/impl/aggregator/Makefile index 75079098..11027ac1 100644 --- a/amgprec/impl/aggregator/Makefile +++ b/amgprec/impl/aggregator/Makefile @@ -62,7 +62,22 @@ amg_s_parmatch_smth_bld.o \ amg_s_parmatch_spmm_bld_inner.o MPCOBJS=MatchBoxPC.o \ -algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o +sendBundledMessages.o \ +initialize.o \ +extractUChunk.o \ +isAlreadyMatched.o \ +findOwnerOfGhost.o \ +clean.o \ +computeCandidateMate.o \ +parallelComputeCandidateMateB.o \ +processMatchedVertices.o \ +processMatchedVerticesAndSendMessages.o \ +processCrossEdge.o \ +queueTransfer.o \ +processMessages.o \ +processExposedVertex.o \ +algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \ +algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o OBJS = $(FOBJS) $(MPCOBJS) diff --git a/amgprec/impl/aggregator/MatchBoxPC.cpp b/amgprec/impl/aggregator/MatchBoxPC.cpp index c1ec0976..90b448dc 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.cpp +++ b/amgprec/impl/aggregator/MatchBoxPC.cpp @@ -60,17 +60,43 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { #if !defined(SERIAL_MPI) MPI_Comm C_comm=MPI_Comm_f2c(icomm); + #ifdef DEBUG fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); #endif - dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, + + +#define TIME_TRACKER + #ifdef TIME_TRACKER + double tmr = MPI_Wtime(); + #endif + +#define OMP +#ifdef OMP + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge, + verLocPtr, verLocInd, edgeLocWeight, + verDistance, Mate, + myRank, 
numProcs, C_comm, + msgIndSent, msgActualSent, msgPercent, + ph0_time, ph1_time, ph2_time, + ph1_card, ph2_card ); +#else + dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge, verLocPtr, verLocInd, edgeLocWeight, verDistance, Mate, myRank, numProcs, C_comm, msgIndSent, msgActualSent, msgPercent, ph0_time, ph1_time, ph2_time, ph1_card, ph2_card ); +#endif + + + #ifdef TIME_TRACKER + tmr = MPI_Wtime() - tmr; + fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer); + #endif + #endif } diff --git a/amgprec/impl/aggregator/MatchBoxPC.h b/amgprec/impl/aggregator/MatchBoxPC.h index 21d0a181..a1fddb59 100644 --- a/amgprec/impl/aggregator/MatchBoxPC.h +++ b/amgprec/impl/aggregator/MatchBoxPC.h @@ -52,145 +52,412 @@ #ifndef _matchboxpC_H_ #define _matchboxpC_H_ -//Turn on a lot of debugging information with this switch: +// Turn on a lot of debugging information with this switch: //#define PRINT_DEBUG_INFO_ #include #include #include #include #include -// #include "matchboxp.h" +#include "omp.h" #include "primitiveDataTypeDefinitions.h" #include "dataStrStaticQueue.h" using namespace std; +const int NUM_THREAD = 4; +const int UCHUNK = 10; + +const MilanLongInt REQUEST = 1; +const MilanLongInt SUCCESS = 2; +const MilanLongInt FAILURE = 3; +const MilanLongInt SIZEINFO = 4; + +const int ComputeTag = 7; // Predefined tag +const int BundleTag = 9; // Predefined tag + +static vector DEFAULT_VECTOR; + +// MPI type map +template +MPI_Datatype TypeMap(); +template <> +inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } +template <> +inline MPI_Datatype TypeMap() { return MPI_INT; } +template <> +inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } +template <> +inline MPI_Datatype TypeMap() { return MPI_FLOAT; } + #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif #if !defined(SERIAL_MPI) - -#define MilanMpiLongInt MPI_LONG_LONG + +#define MilanMpiLongInt MPI_LONG_LONG #ifndef _primitiveDataType_Definition_ #define _primitiveDataType_Definition_ - //Regular integer: - #ifndef INTEGER_H - #define INTEGER_H - typedef int32_t MilanInt; - #endif - - //Regular long integer: - #ifndef LONG_INT_H - #define LONG_INT_H - #ifdef BIT64 - typedef int64_t MilanLongInt; - typedef MPI_LONG MilanMpiLongInt; - #else - typedef int32_t MilanLongInt; - typedef MPI_INT MilanMpiLongInt; - #endif - #endif - - //Regular boolean - #ifndef BOOL_H - #define BOOL_H - typedef bool MilanBool; - #endif - - //Regular double and absolute value computation: - #ifndef REAL_H - #define REAL_H - typedef double MilanReal; - typedef MPI_DOUBLE MilanMpiReal; - inline MilanReal MilanAbs(MilanReal value) - { - return fabs(value); - } - #endif - - //Regular float and absolute value computation: - #ifndef FLOAT_H - #define FLOAT_H - typedef float MilanFloat; - typedef MPI_FLOAT MilanMpiFloat; - inline MilanFloat MilanAbsFloat(MilanFloat value) - { - return fabs(value); - } - #endif - - //// Define the limits: - #ifndef LIMITS_H - #define LIMITS_H - //Integer Maximum and Minimum: - // #define MilanIntMax INT_MAX - // #define MilanIntMin INT_MIN - #define MilanIntMax INT32_MAX - #define MilanIntMin INT32_MIN - - #ifdef BIT64 - #define MilanLongIntMax INT64_MAX - #define MilanLongIntMin -INT64_MAX - #else - #define MilanLongIntMax INT32_MAX - #define MilanLongIntMin -INT32_MAX - #endif - - #endif +// Regular integer: +#ifndef INTEGER_H +#define INTEGER_H + typedef int32_t MilanInt; +#endif + +// Regular long integer: +#ifndef LONG_INT_H +#define LONG_INT_H +#ifdef BIT64 + typedef int64_t MilanLongInt; + typedef 
MPI_LONG MilanMpiLongInt; +#else + typedef int32_t MilanLongInt; + typedef MPI_INT MilanMpiLongInt; +#endif +#endif + +// Regular boolean +#ifndef BOOL_H +#define BOOL_H + typedef bool MilanBool; +#endif + +// Regular double and absolute value computation: +#ifndef REAL_H +#define REAL_H + typedef double MilanReal; + typedef MPI_DOUBLE MilanMpiReal; + inline MilanReal MilanAbs(MilanReal value) + { + return fabs(value); + } +#endif + +// Regular float and absolute value computation: +#ifndef FLOAT_H +#define FLOAT_H + typedef float MilanFloat; + typedef MPI_FLOAT MilanMpiFloat; + inline MilanFloat MilanAbsFloat(MilanFloat value) + { + return fabs(value); + } +#endif + +//// Define the limits: +#ifndef LIMITS_H +#define LIMITS_H + // Integer Maximum and Minimum: + // #define MilanIntMax INT_MAX + // #define MilanIntMin INT_MIN +#define MilanIntMax INT32_MAX +#define MilanIntMin INT32_MIN + +#ifdef BIT64 +#define MilanLongIntMax INT64_MAX +#define MilanLongIntMin -INT64_MAX +#else +#define MilanLongIntMax INT32_MAX +#define MilanLongIntMin -INT32_MAX +#endif + +#endif // +INFINITY const double PLUS_INFINITY = numeric_limits::infinity(); const double MINUS_INFINITY = -PLUS_INFINITY; - //#define MilanRealMax LDBL_MAX - #define MilanRealMax PLUS_INFINITY - #define MilanRealMin MINUS_INFINITY +//#define MilanRealMax LDBL_MAX +#define MilanRealMax PLUS_INFINITY +#define MilanRealMin MINUS_INFINITY #endif -//Function of find the owner of a ghost vertex using binary search: -inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, - MilanInt myRank, MilanInt numProcs); - - void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC -( - MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MPI_Comm comm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); - - void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC -( -MilanLongInt NLVer, MilanLongInt NLEdge, -MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, -MilanLongInt* verDistance, -MilanLongInt* Mate, -MilanInt myRank, MilanInt numProcs, MPI_Comm comm, -MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, -MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, -MilanLongInt* ph1_card, MilanLongInt* ph2_card ); - -void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MilanInt icomm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); - -void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, - MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, - MilanLongInt* verDistance, - MilanLongInt* Mate, - MilanInt myRank, MilanInt numProcs, MilanInt icomm, - MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, - MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, - MilanLongInt* ph1_card, MilanLongInt* ph2_card ); + // Function of find the owner of a ghost vertex using binary search: + MilanInt 
findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + MilanInt myRank, MilanInt numProcs); + + MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight); + + void queuesTransfer(vector &U, + vector &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); + + bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); + + MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap); + + void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdgesPtr, + MilanLongInt *numGhostVerticesPtr, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + vector &U, + vector &privateU, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); + + void clean(MilanLongInt NLVer, + MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent); + + void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate); + + void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + vector &U, + vector &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); + + void PROCESS_CROSS_EDGE(MilanLongInt *edge, + MilanLongInt *SPtr); + + void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + vector &U, + vector &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector 
&QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner); + + void processMatchedVerticesAndSendMessages( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + vector &U, + vector &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *NumMessagesBundledPtr, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, + MPI_Comm comm, + MilanLongInt *msgActual, + vector &Message); + + void sendBundledMessages(MilanLongInt *numGhostEdgesPtr, + MilanInt *BufferSizePtr, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActualPtr, + MilanLongInt *MessageIndexPtr, + MilanInt numProcs, + MilanInt myRank, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus); + + void processMessages( + MilanLongInt NLVer, + MilanLongInt *Mate, + MilanLongInt *candidateMate, + map &Ghost2LocalMap, + vector &GMate, + vector &Counter, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCardPtr, + MilanLongInt *msgIndPtr, + MilanLongInt *msgActualPtr, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *verLocPtr, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanInt numProcs, + MilanInt myRank, + MPI_Comm comm, + vector &Message, + MilanLongInt numGhostEdges, + MilanLongInt u, + MilanLongInt v, + MilanLongInt *SPtr, + vector &U); + + void extractUChunk( + vector &UChunkBeingProcessed, + vector &U, + vector &privateU); + + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); + + void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); + + void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal 
*ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); + + void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MilanInt icomm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); + + void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MilanInt icomm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card); #endif #ifdef __cplusplus diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp index 8be438b6..f03f726f 100644 --- a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.cpp @@ -72,12 +72,6 @@ #ifdef SERIAL_MPI #else -//MPI type map -template MPI_Datatype TypeMap(); -template<> inline MPI_Datatype TypeMap() { return MPI_LONG_LONG; } -template<> inline MPI_Datatype TypeMap() { return MPI_INT; } -template<> inline MPI_Datatype TypeMap() { return MPI_DOUBLE; } -template<> inline MPI_Datatype TypeMap() { return MPI_FLOAT; } // DOUBLE PRECISION VERSION //WARNING: The vertex block on a given rank is contiguous diff --git a/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp new file mode 100644 index 00000000..49b366a6 --- /dev/null +++ b/amgprec/impl/aggregator/algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.cpp @@ -0,0 +1,554 @@ +#include "MatchBoxPC.h" + +// *********************************************************************** +// +// MatchboxP: A C++ library for approximate weighted matching +// Mahantesh Halappanavar (hala@pnnl.gov) +// Pacific Northwest National Laboratory +// +// *********************************************************************** +// +// Copyright (2021) Battelle Memorial Institute +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// ************************************************************************ +////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////// DOMINATING EDGES MODEL /////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate() + * + * Date : New update: Feb 17, 2019, Richland, Washington. + * Date : Original development: May 17, 2009, E&CS Bldg. + * + * Purpose : Compute Approximate Maximum Weight Matching in Linear Time + * + * Args : inputMatrix - instance of Compressed-Col format of Matrix + * Mate - The Mate array + * + * Returns : By Value: (void) + * By Reference: Mate + * + * Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge. + * Assumption: The Mate Array is empty. + */ + +/* + NLVer = #of vertices, NLEdge = #of edges + CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers) + verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process + Mate = A vector of size |V_p| (local subgraph) to store the output (matching) + MPI: myRank, numProcs, comm, + Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world + Statistics: ph0_time, ph1_time, ph2_time: Runtimes + Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2) + */ +//#define DEBUG_HANG_ +#ifdef SERIAL_MPI +#else + +// DOUBLE PRECISION VERSION +// WARNING: The vertex block on a given rank is contiguous +void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP( + MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt *verLocPtr, MilanLongInt *verLocInd, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *Mate, + MilanInt myRank, MilanInt numProcs, MPI_Comm comm, + MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, + MilanReal *msgPercent, + MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time, + MilanLongInt *ph1_card, MilanLongInt *ph2_card) +{ + + /* + * verDistance: it's a vector long as the number of processors. 
+ * verDistance[i] contains the first node index of the i-th processor + * verDistance[i + 1] contains the last node index of the i-th processor + * NLVer: number of elements in the LocPtr + * NLEdge: number of edges assigned to the current processor + * + * Contains the portion of matrix assigned to the processor in + * Yale notation + * verLocInd: contains the positions on row of the matrix + * verLocPtr: i-th value is the position of the first element on the i-th row and + * i+1-th value is the position of the first element on the i+1-th row + */ + +#if !defined(SERIAL_MPI) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Within algoEdgeApproxDominatingEdgesLinearSearchMessageBundling()"; + fflush(stdout); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") verDistance [" ; + for (int i = 0; i < numProcs; i++) + cout << verDistance[i] << "," << verDistance[i+1]; + cout << "]\n"; + fflush(stdout); +#endif +#ifdef DEBUG_HANG_ + if (myRank == 0) { + cout << "\n(" << myRank << ") verDistance [" ; + for (int i = 0; i < numProcs; i++) + cout << verDistance[i] << "," ; + cout << verDistance[numProcs]<< "]\n"; + } + fflush(stdout); +#endif + + MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank + MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank + + MPI_Status computeStatus; + + MilanLongInt msgActual = 0, msgInd = 0; + MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight + MilanReal startTime, finishTime; + + startTime = MPI_Wtime(); + + // Data structures for sending and receiving messages: + vector Message; // [ u, v, message_type ] + Message.resize(3, -1); + // Data structures for Message Bundling: + // Although up to two messages can be sent along any cross edge, + // only one message will be sent in the initialization phase - + // one of: REQUEST/FAILURE/SUCCESS + vector QLocalVtx, QGhostVtx, QMsgType; + vector QOwner; // Changed by Fabio to be an integer, addresses needs to be integers! + + MilanLongInt *PCounter = new MilanLongInt[numProcs]; + for (int i = 0; i < numProcs; i++) + PCounter[i] = 0; + + MilanLongInt NumMessagesBundled = 0; + // TODO when the last computational section will be refactored this could be eliminated + MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses needs to be integers! + MilanLongInt *candidateMate = nullptr; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")NV: " << NLVer << " Edges: " << NLEdge; + fflush(stdout); + cout << "\n(" << myRank << ")StartIndex: " << StartIndex << " EndIndex: " << EndIndex; + fflush(stdout); +#endif + // Other Variables: + MilanLongInt u = -1, v = -1, w = -1, i = 0; + MilanLongInt k = -1, adj1 = -1, adj2 = -1; + MilanLongInt k1 = -1, adj11 = -1, adj12 = -1; + MilanLongInt myCard = 0; + + // Build the Ghost Vertex Set: Vg + map Ghost2LocalMap; // Map each ghost vertex to a local vertex + vector Counter; // Store the edge count for each ghost vertex + MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of Ghost vertices + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")About to compute Ghost Vertices..."; + fflush(stdout); +#endif +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ")About to compute Ghost Vertices..."; + fflush(stdout); +#endif + + // Define Adjacency Lists for Ghost Vertices: + // cout<<"Building Ghost data structures ... 
\n\n"; + vector verGhostPtr, verGhostInd, tempCounter; + // Mate array for ghost vertices: + vector GMate; // Proportional to the number of ghost vertices + MilanLongInt S; + MilanLongInt privateMyCard = 0; + vector PCumulative, PMessageBundle, PSizeInfoMessages; + vector SRequest; // Requests that are used for each send message + vector SStatus; // Status of sent messages, used in MPI_Wait + MilanLongInt MessageIndex = 0; // Pointer for current message + MilanInt BufferSize; + MilanLongInt *Buffer; + + vector privateQLocalVtx, privateQGhostVtx, privateQMsgType; + vector privateQOwner; + vector U, privateU; + + initialize(NLVer, NLEdge, StartIndex, + EndIndex, &numGhostEdges, + &numGhostVertices, &S, + verLocInd, verLocPtr, + Ghost2LocalMap, Counter, + verGhostPtr, verGhostInd, + tempCounter, GMate, + Message, QLocalVtx, + QGhostVtx, QMsgType, QOwner, + candidateMate, U, + privateU, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + finishTime = MPI_Wtime(); + *ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization +#ifdef DEBUG_HANG_ + cout << myRank << " Finished initialization" << endl; + fflush(stdout); +#endif + + startTime = MPI_Wtime(); + + ///////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////// INITIALIZATION ///////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////// + // Compute the Initial Matching Set: + + /* + * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been splitted from + * PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize + * the two. + * PARALLEL_COMPUTE_CANDIDATE_MATE_B is now totally parallel. + */ + + PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer, + verLocPtr, + verLocInd, + myRank, + edgeLocWeight, + candidateMate); + +#ifdef DEBUG_HANG_ + cout << myRank << " Finished Exposed Vertex" << endl; + fflush(stdout); +#if 0 + cout << myRank << " candidateMate after parallelCompute " < UChunkBeingProcessed; + UChunkBeingProcessed.reserve(UCHUNK); + + processMatchedVertices(NLVer, + UChunkBeingProcessed, + U, + privateU, + StartIndex, + EndIndex, + &myCard, + &msgInd, + &NumMessagesBundled, + &S, + verLocPtr, + verLocInd, + verDistance, + PCounter, + Counter, + myRank, + numProcs, + candidateMate, + GMate, + Mate, + Ghost2LocalMap, + edgeLocWeight, + QLocalVtx, + QGhostVtx, + QMsgType, + QOwner, + privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + +#ifdef DEBUG_HANG_ + cout << myRank << " Finished Process Vertices" << endl; + fflush(stdout); +#if 0 + cout << myRank << " Mate after Matched Vertices " <BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
+ + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) cycle step1 + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + ip = 0 + do k=1, nz + j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +351,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +369,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +378,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +424,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
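Editor's note on the OpenMP phase-one loop introduced above: each thread `kk` counts its aggregates in `locnaggr(kk)` and temporarily packs the provisional aggregate number together with the owning thread into a single `ilaggr` entry via `itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk`; after an exclusive prefix sum over `locnaggr`, a second pass recovers the owning thread with `mod` and shifts the aggregate number into the final global range. The sketch below replays just that encode/prefix/decode arithmetic on made-up sizes, in C++ for compactness; it is an illustration of the renumbering scheme, not the library code.

```cpp
// Minimal sketch (not the library code) of the per-thread aggregate
// renumbering used in the OpenMP phase-one loop: thread kk owns rows
// bnds[kk]..bnds[kk+1]-1, counts its aggregates in locnaggr[kk], and a
// provisional ilaggr entry packs (local aggregate id, owning thread)
// into one integer.  All sizes and values below are hypothetical.
#include <cstdio>
#include <vector>

int main() {
    const int nths = 3;                         // hypothetical thread count
    std::vector<long> bnds = {1, 4, 7, 10};     // 1-based row ranges, as in the Fortran
    std::vector<long> locnaggr(nths + 2, 0);    // per-thread aggregate counters
    const long nr = 9;
    std::vector<long> ilaggr(nr + 1, -(nr + 1)); // 1-based; -(nr+1) marks "unassigned"

    // Encode: pretend each thread creates one aggregate containing its first row.
    for (int kk = 0; kk < nths; ++kk) {
        long i = bnds[kk];
        locnaggr[kk] += 1;
        long itmp = (bnds[kk] - 1 + locnaggr[kk]) * nths + kk; // id and thread packed together
        ilaggr[i] = itmp;
    }

    // Total count and exclusive prefix over locnaggr, as in the master section.
    long naggr = 0;
    for (int kk = 0; kk < nths; ++kk) naggr += locnaggr[kk];
    for (int i = 1; i <= nths; ++i)       locnaggr[i] += locnaggr[i - 1];
    for (int i = nths + 1; i >= 1; --i)   locnaggr[i]  = locnaggr[i - 1];
    locnaggr[0] = 0;

    // Decode: recover the owning thread with mod, then shift into the global numbering.
    for (long ii = 1; ii <= nr; ++ii) {
        if (ilaggr[ii] > 0) {
            long kp = ilaggr[ii] % nths;
            ilaggr[ii] = (ilaggr[ii] / nths) - (bnds[kp] - 1) + locnaggr[kp];
        }
    }

    for (long ii = 1; ii <= nr; ++ii) std::printf("ilaggr(%ld) = %ld\n", ii, ilaggr[ii]);
    std::printf("naggr = %ld\n", naggr);
    return 0;
}
```

With these made-up bounds the three provisional entries 3, 13 and 23 decode to aggregates 1, 2 and 3, which is the property the second `!$omp do` loop above depends on.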
@@ -274,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -292,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -303,15 +492,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -336,9 +528,13 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 similarity index 57% rename from amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index 020cae4b..b250e434 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -68,9 +68,12 @@ ! subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_c_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. 
info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,217 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
+ + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,16 +401,21 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) end if - + if (do_timings) call psb_toc(idx_soc2_p1) + if (do_timings) call psb_tic(idx_soc2_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -259,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc2_p2) + if (do_timings) call psb_tic(idx_soc2_p3) ! ! Phase three: sweep over leftovers, if any ! @@ -294,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)& + !$omp private(ii,i,j,k) do i=1, nr if (ilaggr(i) <= 0) then nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) @@ -305,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc2_p3) if (naggr > ncol) then info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') diff --git a/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 index 53e740fe..c4a85b05 100644 --- a/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_caggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_spk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. 
+ integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = czero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= czero) then adiag(i) = cone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = cone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! 
acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 index 2edcca6c..26edbb0a 100644 --- a/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_d_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_d_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! + if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_d_ptap_bld.f90 b/amgprec/impl/aggregator/amg_d_ptap_bld.f90 index 8520e58e..4006c04c 100644 --- a/amgprec/impl/aggregator/amg_d_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. 
- integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! - do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 similarity index 56% rename from amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index e3ae5245..de95abce 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_d_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -85,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_dpk_), allocatable :: val(:), diag(:) @@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,203 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 - icnt = 0 +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
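! Reviewer note (illustrative sketch, not part of the patch): the scheme
! described in the comment above packs the owning thread KK and the
! thread-local aggregate ordinal into a single encoded ILAGGR value, so
! the renumbering loop after the barrier can recover both without extra
! storage. A standalone round trip of the encode/decode arithmetic used
! below; the program name and all numeric values are made up, only the
! formulas are taken from the patch:
program encode_decode_sketch
  implicit none
  integer :: nths, kk, bnds_kk, lnag_kk, pref_kk, itmp, kp, iagg
  nths    = 4      ! number of threads (assumed)
  kk      = 2      ! owning thread
  bnds_kk = 51     ! bnds(kk): first row owned by thread kk
  lnag_kk = 7      ! locnaggr(kk) at creation time: 7th local aggregate
  pref_kk = 40     ! exclusive prefix sum: aggregates of threads 0..kk-1
  ! Encode, as in the step1 loop:
  itmp = (bnds_kk - 1 + lnag_kk)*nths + kk                ! = 230
  ! Decode, as in the renumbering loop after the barrier:
  kp   = mod(itmp, nths)                                  ! = 2, the thread id
  iagg = (itmp/nths) - (bnds_kk - 1) + pref_kk            ! = 47, global id
  write(*,*) 'thread', kp, ' local ordinal', itmp/nths - (bnds_kk-1), ' global', iagg
end program encode_decode_sketch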
+ + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) cycle step1 + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + ip = 0 + do k=1, nz + j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
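! Reviewer note (illustrative sketch, not part of the patch): the
! DISJOINT test evaluated just below has the plain serial meaning
! "none of the strongly coupled neighbours has been claimed yet"; the
! comment above points out that, without a lock, another thread may
! claim one of them between the test and the atomic writes, which costs
! repeatability but not correctness. A standalone toy of the test itself
! (program name and values are made up):
program disjoint_test_sketch
  implicit none
  integer, parameter :: nr = 6
  integer :: ilaggr(nr), icol(3), ip
  logical :: disjoint
  ilaggr = -(nr+1)          ! all rows still carry the "unassigned" sentinel
  ilaggr(4) = 1             ! ...except row 4, already in aggregate 1
  ip = 3; icol(1:ip) = [2, 3, 5]
  disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)) .or. (ip == 0)
  write(*,*) 'neighbourhood {2,3,5} disjoint?', disjoint   ! T: new aggregate
  icol(1:ip) = [2, 4, 5]
  disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)) .or. (ip == 0)
  write(*,*) 'neighbourhood {2,4,5} disjoint?', disjoint   ! F: leave for phase 2
end program disjoint_test_sketch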
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +351,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +369,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +378,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +424,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
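! Reviewer note (illustrative sketch, not part of the patch): the new
! "Check 1"/"Check 1.5" debug lines above print NAGGR, the number of
! rows still carrying the -(NR+1) sentinel, the number of rows with a
! positive aggregate id, and the sum of the two against NR; in this SOC1
! variant every local row is in exactly one of those two states after
! phases 1 and 2, so the sum should equal NR. A tiny standalone
! rendition of the same invariant (names and values are made up):
program ilaggr_check_sketch
  implicit none
  integer, parameter :: nr = 8
  integer :: ilaggr(nr), nun, nas
  ilaggr = -(nr+1)                 ! start of phase 1: everything unassigned
  ilaggr(1:5) = [1, 1, 2, 3, 3]    ! pretend five rows were aggregated
  nun = count(ilaggr(1:nr) == -(nr+1))
  nas = count(ilaggr(1:nr) > 0)
  write(*,*) 'unassigned + assigned =', nun + nas, ' (should equal', nr, ')'
end program ilaggr_check_sketch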
@@ -274,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -292,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -303,15 +492,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -336,9 +528,13 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 similarity index 57% rename from amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 1433a670..345cd1ad 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -68,9 +68,12 @@ ! subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_d_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. 
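! Reviewer note: the timing hooks added throughout this patch all follow
! the same idiom as the SAVEd indices declared just above: an index
! initialized to -1, registered once per process with psb_get_timer_idx,
! and psb_tic/psb_toc pairs guarded by the compile-time DO_TIMINGS
! switch so the calls disappear from the hot path when timing is off.
! Sketch of the idiom for a hypothetical extra phase (the label and the
! variable name are made up; only the three PSBLAS calls are real and
! they are used exactly as elsewhere in this patch):
!
!   integer(psb_ipk_), save :: idx_myphase = -1
!   logical, parameter      :: do_timings  = .true.
!   ...
!   if ((do_timings).and.(idx_myphase==-1)) &
!        & idx_myphase = psb_get_timer_idx("SOC2_MAP: myphase")
!   ...
!   if (do_timings) call psb_tic(idx_myphase)
!   ! ... work to be timed ...
!   if (do_timings) call psb_toc(idx_myphase)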
info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,217 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
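! Reviewer note (illustrative sketch, not part of the patch): the values
! filled into MUIJ a few lines above are the usual strength-of-connection
! measure mu(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|), and S_NEIGH records
! +1 where mu(i,j) >= theta (strong link) and -1 otherwise. Keeping one
! S_NEIGH entry per MUIJ nonzero, weak and remote links included, is
! what lets the construction loop reuse the same index K in both
! matrices and drop the old shared counter IP, so it can run in
! parallel. One scalar round of the classification (values made up):
program soc_measure_sketch
  implicit none
  real :: aij, aii, ajj, theta, mu
  aij = -0.8; aii = 4.0; ajj = 2.0; theta = 0.25
  mu = abs(aij)/sqrt(abs(aii*ajj))     ! = 0.8/sqrt(8) ~ 0.283
  if (mu >= theta) then
    write(*,*) 'strong link, mu =', mu
  else
    write(*,*) 'weak link,   mu =', mu
  end if
end program soc_measure_sketch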
+ + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
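! Reviewer note (illustrative sketch, not part of the patch): in this
! SOC2 variant the strong neighbours of row I are extracted from the
! row of S_NEIGH with COUNT/PACK above (strong links were stored as +1,
! weak ones as -1), and rows whose 1-neighbourhood is empty or contains
! only the row itself get ILAGGR(I)=0 and are left to the later sweeps.
! A standalone toy of the COUNT/PACK selection (values made up):
program strong_neigh_sketch
  implicit none
  real    :: sval(5)
  integer :: sja(5), icol(5), nzcnt
  sja  = [ 1,  3,  4,  6,  9 ]            ! column indices of row I
  sval = [ 1., -1., 1., -1., 1. ]         ! +1 strong, -1 weak
  nzcnt = count(sval > 0)                 ! 3 strong links
  icol(1:nzcnt) = pack(sja, sval > 0)     ! [1, 4, 9]
  write(*,*) 'strong neighbours:', icol(1:nzcnt)
end program strong_neigh_sketch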
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,16 +401,21 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) end if - + if (do_timings) call psb_toc(idx_soc2_p1) + if (do_timings) call psb_tic(idx_soc2_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -259,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc2_p2) + if (do_timings) call psb_tic(idx_soc2_p3) ! ! Phase three: sweep over leftovers, if any ! @@ -294,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)& + !$omp private(ii,i,j,k) do i=1, nr if (ilaggr(i) <= 0) then nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) @@ -305,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc2_p3) if (naggr > ncol) then info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') diff --git a/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 index 82da3fc7..d365bf27 100644 --- a/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_daggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_dpk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. 
+ integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = dzero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= dzero) then adiag(i) = done / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = done end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! 
acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 index c52c04f7..9529d141 100644 --- a/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_s_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_s_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! + if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_s_ptap_bld.f90 b/amgprec/impl/aggregator/amg_s_ptap_bld.f90 index 93b79b63..e1a6c867 100644 --- a/amgprec/impl/aggregator/amg_s_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. 
- integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! - do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 similarity index 56% rename from amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 3f4cc437..0a809624 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_s_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -85,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables - integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& + integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),& & ideg(:), idxs(:) integer(psb_lpk_), allocatable :: tmpaggr(:) real(psb_spk_), allocatable :: val(:), diag(:) @@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_lpk_) :: nrglob character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1 + integer(psb_ipk_), save :: idx_soc1_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc1_map_bld' @@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc1_p0==-1)) & + & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0") + if ((do_timings).and.(idx_soc1_p1==-1)) & + & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1") + if ((do_timings).and.(idx_soc1_p2==-1)) & + & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2") + if ((do_timings).and.(idx_soc1_p3==-1)) & + & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -133,41 +146,203 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc1_p0) call a%cp_to(acsr) + if (do_timings) call psb_toc(idx_soc1_p0) if (clean_zeros) call acsr%clean_zeros(info) if (iorder == amg_aggr_ord_nat_) then + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) - idxs(i) = i + idxs(i) = i end do - else + !$omp end parallel do + else + !$omp parallel do private(i) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = acsr%irp(i+1) - acsr%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if - + if (do_timings) call psb_tic(idx_soc1_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 - icnt = 0 +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
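! Reviewer note (illustrative sketch, not part of the patch): inside the
! threaded loops of this patch the old "goto 9999" error exits are
! replaced by an atomic write to INFO followed by CYCLE, since branching
! out of an OpenMP worksharing construct is not conforming; the single
! "if (info /= 0) goto 9999" after the region then takes the error path
! once (the serial #else branches mirror the same structure). A minimal
! standalone rendition of the pattern, with a plain constant in place of
! psb_err_internal_error_ (program name and values are made up):
program omp_error_flag_sketch
  implicit none
  integer :: info, i
  integer, parameter :: some_error = -9999
  info = 0
  !$omp parallel do schedule(static) shared(info) private(i)
  do i = 1, 100
    if (info /= 0) cycle          ! stop doing useful work after a failure
    if (i == 42) then             ! pretend item 42 fails
      !$omp atomic write
      info = some_error
      !$omp end atomic
      cycle
    end if
    ! ... normal work on item i ...
  end do
  !$omp end parallel do
  if (info /= 0) write(*,*) 'error detected once, info =', info
end program omp_error_flag_sketch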
+ + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) cycle step1 + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + ip = 0 + do k=1, nz + j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
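! Reviewer note (an interpretation on my part, not something stated in
! the patch): the guard "if (itmp < (bnds(kk)-1+locnaggr(kk)))" applied
! just below looks like an overflow check on the encoding: the product
! with NTHS is evaluated in the working integer kind (psb_ipk_) before
! being stored into the wider ITMP, so for very large local matrices and
! many threads it can wrap around and drop below the unscaled value, in
! which case INFO gets the 12345678 sentinel and the error is reported
! after the parallel region. A well-defined standalone illustration of
! why the product can need more than 32 bits (the wrap is simulated with
! modular arithmetic; all values are made up):
program overflow_guard_sketch
  use iso_fortran_env, only : int64
  implicit none
  integer(int64) :: base, nths, kk, exact, wrapped
  base = 600000000_int64           ! plays the role of bnds(kk)-1+locnaggr(kk)
  nths = 8_int64
  kk   = 3_int64
  exact   = base*nths + kk                                  ! 4800000003
  wrapped = modulo(exact + 2_int64**31, 2_int64**32) - 2_int64**31
  write(*,*) 'exact  :', exact
  write(*,*) 'wrapped:', wrapped, ' (< base, so the guard would fire)'
end program overflow_guard_sketch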
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +351,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +369,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +378,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +424,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
@@ -274,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -292,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -303,15 +492,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -336,9 +528,13 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 similarity index 57% rename from amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index 4bb17a80..ef7f5707 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -68,9 +68,12 @@ ! subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_s_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. 
info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,217 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
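! Reviewer note (illustrative sketch, not part of the patch): in the
! parallel block that follows, BNDS is built so that thread MYTH owns
! rows BNDS(MYTH) .. BNDS(MYTH+1)-1: each thread stores its own chunk
! size (NR/NTHS, plus one for the first MOD(NR,NTHS) threads) and a
! prefix sum on the master turns the sizes into starting offsets. The
! same computation done serially for NR=10 rows on NTHS=3 threads
! (program name and values are made up):
program bnds_sketch
  implicit none
  integer, parameter :: nr = 10, nths = 3
  integer :: bnds(0:nths), myth, rsz, i
  bnds(0) = 1
  do myth = 0, nths-1
    rsz = nr/nths
    if (myth < mod(nr,nths)) rsz = rsz + 1
    bnds(myth+1) = rsz
  end do
  do i = 1, nths
    bnds(i) = bnds(i) + bnds(i-1)
  end do
  write(*,*) 'bnds =', bnds     ! 1 5 8 11: rows 1-4, 5-7, 8-10
end program bnds_sketch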
+ + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
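! Reviewer note (illustrative sketch, not part of the patch): once step1
! is done, the master-only section a few lines below turns the per-thread
! aggregate counts into an exclusive prefix sum -- an in-place inclusive
! scan followed by a shift by one -- so that LOCNAGGR(KP) ends up holding
! the number of aggregates created by threads 0..KP-1, which is exactly
! the offset added in the renumbering loop. The same transformation on
! made-up counts for 4 threads:
program prefix_sum_sketch
  implicit none
  integer, parameter :: nths = 4
  integer :: locnaggr(0:nths+1), naggr, i
  locnaggr = 0
  locnaggr(0:nths-1) = [5, 3, 7, 2]     ! aggregates created by each thread
  naggr = sum(locnaggr(0:nths-1))       ! 17 in total
  do i = 1, nths
    locnaggr(i) = locnaggr(i) + locnaggr(i-1)
  end do
  do i = nths+1, 1, -1
    locnaggr(i) = locnaggr(i-1)
  end do
  locnaggr(0) = 0
  write(*,*) 'naggr   =', naggr               ! 17
  write(*,*) 'offsets =', locnaggr(0:nths-1)  ! 0 5 8 15
end program prefix_sum_sketch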
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,16 +401,21 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) end if - + if (do_timings) call psb_toc(idx_soc2_p1) + if (do_timings) call psb_tic(idx_soc2_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -259,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc2_p2) + if (do_timings) call psb_tic(idx_soc2_p3) ! ! Phase three: sweep over leftovers, if any ! @@ -294,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)& + !$omp private(ii,i,j,k) do i=1, nr if (ilaggr(i) <= 0) then nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) @@ -305,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc2_p3) if (naggr > ncol) then info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') diff --git a/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 index d96176b2..c2eae3a4 100644 --- a/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_saggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_spk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. 
+ integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = szero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= szero) then adiag(i) = sone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = sone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! 
acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 b/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 index a64e3ebb..a6a7856e 100644 --- a/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 +++ b/amgprec/impl/aggregator/amg_z_dec_aggregator_tprol.f90 @@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& integer(psb_lpk_) :: ntaggr integer(psb_ipk_) :: debug_level, debug_unit logical :: clean_zeros + integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1 + logical, parameter :: do_timings=.false. name='amg_z_dec_aggregator_tprol' call psb_erractionsave(err_act) @@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_map_bld==-1)) & + & idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld") + if ((do_timings).and.(idx_map_tprol==-1)) & + & idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol") call amg_check_def(parms%ml_cycle,'Multilevel cycle',& & amg_mult_ml_,is_legal_ml_cycle) @@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,& ! The decoupled aggregator based on SOC measures ignores ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! + if (do_timings) call psb_tic(idx_map_bld) clean_zeros = ag%do_clean_zeros call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) + if (do_timings) call psb_toc(idx_map_bld) + if (do_timings) call psb_tic(idx_map_tprol) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) + if (do_timings) call psb_toc(idx_map_tprol) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') diff --git a/amgprec/impl/aggregator/amg_z_ptap_bld.f90 b/amgprec/impl/aggregator/amg_z_ptap_bld.f90 index 6faf1b71..e322a303 100644 --- a/amgprec/impl/aggregator/amg_z_ptap_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_ptap_bld.f90 @@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. 
- integer(psb_ipk_), save :: idx_spspmm=-1 + integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1 name='amg_ptap_bld' if(psb_get_errstatus().ne.0) return @@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ncol = desc_a%get_local_cols() if ((do_timings).and.(idx_spspmm==-1)) & - & idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") + & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm") + if ((do_timings).and.(idx_cpytrans1==-1)) & + & idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1") + if ((do_timings).and.(idx_cpytrans2==-1)) & + & idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2") naggr = nlaggr(me+1) ntaggr = sum(nlaggr) @@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& ! Ok first product done. if (present(desc_ax)) then + if (do_timings) call psb_tic(idx_cpytrans1) block call coo_prol%cp_to_coo(coo_restr,info) call coo_restr%set_ncols(desc_ac%get_local_cols()) @@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%set_ncols(desc_ax%get_local_cols()) end block call csr_restr%cp_from_coo(coo_restr,info) - + if (do_timings) call psb_toc(idx_cpytrans1) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 @@ -167,27 +172,28 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,& call coo_restr%transp() nzl = coo_restr%get_nzeros() - nrl = desc_ac%get_local_rows() - i=0 + nrl = desc_ac%get_local_rows() + call coo_restr%fix(info) + i=coo_restr%get_nzeros() ! ! Only keep local rows ! - do k=1, nzl - if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then - i = i+1 - coo_restr%val(i) = coo_restr%val(k) - coo_restr%ia(i) = coo_restr%ia(k) - coo_restr%ja(i) = coo_restr%ja(k) + search: do k=i,1,-1 + if (coo_restr%ia(k) <= nrl) then + call coo_restr%set_nzeros(k) + exit search end if - end do - call coo_restr%set_nzeros(i) - call coo_restr%fix(info) + end do search + nzl = coo_restr%get_nzeros() call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_ncols(desc_a%get_local_cols()) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) + if (do_timings) call psb_tic(idx_cpytrans2) + call csr_restr%cp_from_coo(coo_restr,info) + if (do_timings) call psb_toc(idx_cpytrans2) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') goto 9999 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 similarity index 56% rename from amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 rename to amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index d9f24130..2c467426 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_z_inner_mod - +#if defined(OPENMP) + use omp_lib +#endif implicit none ! Arguments @@ -85,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables
-  integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
+  integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
       & ideg(:), idxs(:)
  integer(psb_lpk_), allocatable :: tmpaggr(:)
  complex(psb_dpk_), allocatable  :: val(:), diag(:)
@@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  integer(psb_ipk_) :: nrow, ncol, n_ne
  integer(psb_lpk_) :: nrglob
  character(len=20)  :: name, ch_err
+  integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
+  integer(psb_ipk_), save :: idx_soc1_p0=-1
+  logical, parameter :: do_timings=.true.
  info=psb_success_
  name = 'amg_soc1_map_bld'
@@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  nrow  = desc_a%get_local_rows()
  ncol  = desc_a%get_local_cols()
  nrglob = desc_a%get_global_rows()
+  if ((do_timings).and.(idx_soc1_p0==-1)) &
+       & idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
+  if ((do_timings).and.(idx_soc1_p1==-1)) &
+       & idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
+  if ((do_timings).and.(idx_soc1_p2==-1)) &
+       & idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
+  if ((do_timings).and.(idx_soc1_p3==-1)) &
+       & idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
  nr = a%get_nrows()
  nc = a%get_ncols()
@@ -133,41 +146,203 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
    goto 9999
  end if
+  if (do_timings) call psb_tic(idx_soc1_p0)
  call a%cp_to(acsr)
+  if (do_timings) call psb_toc(idx_soc1_p0)
  if (clean_zeros) call acsr%clean_zeros(info)
  if (iorder == amg_aggr_ord_nat_) then
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
-      idxs(i) = i
+      idxs(i)   = i
    end do
-  else
+    !$omp end parallel do
+  else
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      ideg(i)   = acsr%irp(i+1) - acsr%irp(i)
    end do
+    !$omp end parallel do
    call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
  end if
-
+  if (do_timings) call psb_tic(idx_soc1_p1)
  !
  ! Phase one: Start with disjoint groups.
  !
  naggr = 0
-  icnt = 0
+#if defined(OPENMP)
+  block
+    integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
+    integer(psb_ipk_) :: myth,nths, kk
+    ! The parallelization makes use of a locnaggr(:) array; each thread
+    ! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
+    ! to locnaggr to determine:
+    ! 1. The total number of aggregates NAGGR;
+    ! 2. How much each thread should shift its own aggregates.
+    ! Part 2 requires keeping track of which thread defined each entry
+    ! of ilaggr(), so that each entry can be adjusted correctly: even
+    ! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
+    ! been set because it is strongly connected to an entry J belonging to a
+    ! different thread.
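    ! A small worked example of the encoding used in the parallel region below
    ! (the numbers are illustrative only, not taken from any run). A provisional
    ! aggregate id is packed as  itmp = (bnds(kk)-1+locnaggr(kk))*nths + kk,
    ! so both the owning thread and the per-thread ordinal can be recovered later:
    ! with nths = 4 threads, thread kk = 1, bnds(1) = 11, creating its third
    ! aggregate (locnaggr(1) = 3), one gets  itmp = (11-1+3)*4 + 1 = 53.
    ! The decode loop at the end of the parallel region computes
    !   kp = mod(53,4) = 1                  (owning thread)
    !   53/4 - (bnds(1)-1) = 13 - 10 = 3    (third aggregate of that thread)
    ! and adds the prefix-summed locnaggr(kp) to obtain the global number.
    ! Because itmp is about nths times larger than the plain counter, the test
    ! itmp < (bnds(kk)-1+locnaggr(kk)) below flags integer overflow in the encoding.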
+ + !$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) cycle step1 + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + if (ilaggr(i) == -(nr+1)) then + nz = (acsr%irp(i+1)-acsr%irp(i)) + if ((nz<0).or.(nz>size(icol))) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) + val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1) + + ! + ! Build the set of all strongly coupled nodes + ! + ip = 0 + do k=1, nz + j = icol(k) + ! If any of the neighbours is already assigned, + ! we will not reset. + if (ilaggr(j) > 0) cycle step1 + if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then + ip = ip + 1 + icol(ip) = icol(k) + end if + enddo + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
+ disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if +#else step1: do ii=1, nr + if (info /= 0) cycle i = idxs(ii) if ((i<1).or.(i>nr)) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if - + if (ilaggr(i) == -(nr+1)) then nz = (acsr%irp(i+1)-acsr%irp(i)) if ((nz<0).or.(nz>size(icol))) then info=psb_err_internal_error_ call psb_errpush(info,name) - goto 9999 + cycle step1 + !goto 9999 end if icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) @@ -176,7 +351,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! ! Build the set of all strongly coupled nodes ! - ip = 0 + ip = 0 do k=1, nz j = icol(k) if ((1<=j).and.(j<=nr)) then @@ -194,8 +369,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! contains I even if it does not look like it from matrix) ! disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - icnt = icnt + 1 + if (disjoint) then naggr = naggr + 1 do k=1, ip ilaggr(icol(k)) = naggr @@ -204,16 +378,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& - & ' Check 1:',count(ilaggr == -(nr+1)) + & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr end if - + if (do_timings) call psb_toc(idx_soc1_p1) + if (do_timings) call psb_tic(idx_soc1_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -244,8 +424,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc1_p2) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if - + if (do_timings) call psb_tic(idx_soc1_p3) ! ! Phase three: sweep over leftovers, if any ! 
@@ -274,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if enddo if (ip > 0) then - icnt = icnt + 1 naggr = naggr + 1 ilaggr(i) = naggr do k=1, ip @@ -292,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,info)& + !$omp private(ii,i,j,k,nz,icol,val,ip) do i=1, nr + if (info /= 0) cycle if (ilaggr(i) < 0) then nz = (acsr%irp(i+1)-acsr%irp(i)) if (nz == 1) then @@ -303,15 +492,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc1_p3) if (naggr > ncol) then - !write(0,*) name,'Error : naggr > ncol',naggr,ncol info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') goto 9999 @@ -336,9 +528,13 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nlaggr(:) = 0 nlaggr(me+1) = naggr call psb_sum(ctxt,nlaggr(1:np)) + if (debug_level >= psb_debug_outer_) then + write(debug_unit,*) me,' ',trim(name),& + & ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),& + & count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr + end if call acsr%free() - call psb_erractionrestore(err_act) return diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 similarity index 57% rename from amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 rename to amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index c1b165b1..c6ac226e 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.f90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -68,9 +68,12 @@ ! subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_z_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. 
info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,217 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. 
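    ! An illustrative trace of the master-region bookkeeping used further down
    ! (made-up values): with nths = 3 threads producing locnaggr(0:2) = (3,5,2)
    ! aggregates, naggr = sum(locnaggr(0:2)) = 10; the first loop accumulates the
    ! running sums, the second shifts them one place to the right, and locnaggr(0)
    ! is reset, leaving the exclusive offsets (0,3,8) for threads 0, 1 and 2.
    ! The decode loop then maps, e.g., the 2nd local aggregate of thread 1 to
    ! global aggregate 2 + locnaggr(1) = 5.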
+ + !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) & + !$omp private(icol,val,myth,kk) + block + integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt + integer(psb_lpk_) :: itmp + !$omp master + nths = omp_get_num_threads() + allocate(bnds(0:nths),locnaggr(0:nths+1)) + locnaggr(:) = 0 + bnds(0) = 1 + !$omp end master + !$omp barrier + myth = omp_get_thread_num() + rsz = nr/nths + if (myth < mod(nr,nths)) rsz = rsz + 1 + bnds(myth+1) = rsz + !$omp barrier + !$omp master + do i=1,nths + bnds(i) = bnds(i) + bnds(i-1) + end do + info = 0 + !$omp end master + !$omp barrier + + !$omp do schedule(static) private(disjoint) + do kk=0, nths-1 + step1: do ii=bnds(kk), bnds(kk+1)-1 + i = idxs(ii) + if (info /= 0) then + write(0,*) ' Step1:',kk,ii,i,info + cycle step1 + end if + if ((i<1).or.(i>nr)) then + !$omp atomic write + info=psb_err_internal_error_ + !$omp end atomic + call psb_errpush(info,name) + cycle step1 + !goto 9999 + end if + + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! Same if ip==0 (in which case, neighborhood only + ! contains I even if it does not look like it from matrix) + ! The fact that DISJOINT is private and not under lock + ! generates a certain un-repeatability, in that between + ! computing DISJOINT and assigning, another thread might + ! alter the values of ILAGGR. + ! However, a certain unrepeatability is already present + ! because the sequence of aggregates is computed with a + ! different order than in serial mode. + ! In any case, even if the enteries of ILAGGR may be + ! overwritten, the important thing is that each entry is + ! consistent and they generate a correct aggregation map. + ! 
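    ! Note that rows flagged with ilaggr(i) = 0 above (an empty or self-only
    ! strong neighbourhood in s_neigh) never reach this point: they are skipped
    ! here and, as far as can be seen from the code below, picked up again by
    ! the later sweeps, since the final leftover loop tests ilaggr(i) <= 0.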
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if + end do + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,16 +401,21 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) end if - + if (do_timings) call psb_toc(idx_soc2_p1) + if (do_timings) call psb_tic(idx_soc2_p2) ! ! Phase two: join the neighbours ! + !$omp workshare tmpaggr = ilaggr + !$omp end workshare + !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)& + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) @@ -259,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end if end do step2 - - + !$omp end parallel do + if (do_timings) call psb_toc(idx_soc2_p2) + if (do_timings) call psb_tic(idx_soc2_p3) ! ! Phase three: sweep over leftovers, if any ! @@ -294,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end do step3 ! Any leftovers? + !$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)& + !$omp private(ii,i,j,k) do i=1, nr if (ilaggr(i) <= 0) then nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) @@ -305,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! other processes. ilaggr(i) = -(nrglob+nr) else + !$omp atomic write info=psb_err_internal_error_ + !$omp end atomic call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') - goto 9999 + cycle endif end if end do - + !$omp end parallel do + if (info /= 0) goto 9999 + if (do_timings) call psb_toc(idx_soc2_p3) if (naggr > ncol) then info=psb_err_internal_error_ call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') diff --git a/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 b/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 index 2f944699..7b8ed075 100644 --- a/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 +++ b/amgprec/impl/aggregator/amg_zaggrmat_smth_bld.f90 @@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& real(psb_dpk_) :: anorm, omega, tmp, dg, theta logical, parameter :: debug_new=.false. character(len=80) :: filename + logical, parameter :: do_timings=.false. 
+ integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1 + integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1 name='amg_aggrmat_smth_bld' info=psb_success_ @@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ctxt = desc_a%get_context() call psb_info(ctxt, me, np) + if ((do_timings).and.(idx_spspmm==-1)) & + & idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm") + if ((do_timings).and.(idx_phase1==-1)) & + & idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ") + if ((do_timings).and.(idx_phase2==-1)) & + & idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ") + if ((do_timings).and.(idx_phase3==-1)) & + & idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ") + if ((do_timings).and.(idx_gtrans==-1)) & + & idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ") + if ((do_timings).and.(idx_refine==-1)) & + & idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ") + if ((do_timings).and.(idx_cdasb==-1)) & + & idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ") + if ((do_timings).and.(idx_ptap==-1)) & + & idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ") + nglob = desc_a%get_global_rows() nrow = desc_a%get_local_rows() @@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! naggr: number of local aggregates ! nrow: local rows. ! + if (do_timings) call psb_tic(idx_phase1) ! Get the diagonal D adiag = a%get_diag(info) @@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& ! ! Build the filtered matrix Af from A ! - + !$OMP parallel do private(i,j,tmp,jd) schedule(static) do i=1, nrow tmp = zzero jd = -1 @@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& acsrf%val(jd)=acsrf%val(jd)-tmp end if enddo + !$OMP end parallel do ! Take out zeroed terms call acsrf%clean_zeros(info) end if + !$OMP parallel do private(i) schedule(static) do i=1,size(adiag) if (adiag(i) /= zzero) then adiag(i) = zone / adiag(i) @@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& adiag(i) = zone end if end do - + !$OMP end parallel do if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_eig == amg_max_norm_) then @@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') goto 9999 end if + if (do_timings) call psb_toc(idx_phase1) - + if (do_timings) call psb_tic(idx_phase2) call acsrf%scal(adiag,info) if (info /= psb_success_) goto 9999 @@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_cdasb(desc_ac,info) call psb_cd_reinit(desc_ac,info) + if (do_timings) call psb_toc(idx_phase2) + if (do_timings) call psb_tic(idx_phase3) ! ! Build the smoothed prolongator using either A or Af ! 
acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol @@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') goto 9999 end if - - + if (do_timings) call psb_toc(idx_phase3) + if (do_timings) call psb_tic(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done SPSPMM 1' @@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,& call op_prol%mv_from(coo_prol) call op_restr%mv_from(coo_restr) - + if (do_timings) call psb_toc(idx_ptap) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),& & 'Done smooth_aggregate ' diff --git a/amgprec/impl/aggregator/clean.cpp b/amgprec/impl/aggregator/clean.cpp new file mode 100644 index 00000000..f316aee7 --- /dev/null +++ b/amgprec/impl/aggregator/clean.cpp @@ -0,0 +1,91 @@ +#include "MatchBoxPC.h" + +// TODO comment + +void clean(MilanLongInt NLVer, + MilanInt myRank, + MilanLongInt MessageIndex, + vector &SRequest, + vector &SStatus, + MilanInt BufferSize, + MilanLongInt *Buffer, + MilanLongInt msgActual, + MilanLongInt *msgActualSent, + MilanLongInt msgInd, + MilanLongInt *msgIndSent, + MilanLongInt NumMessagesBundled, + MilanReal *msgPercent) +{ + // Cleanup Phase + +#pragma omp parallel + { +#pragma omp master + { +#pragma omp task + { + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Waitall= " << endl; + fflush(stdout); +#endif +#ifdef DEBUG_HANG_ + cout << "\n(" << myRank << ") Waitall " << endl; + fflush(stdout); +#endif + //return; + + MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]); + + // MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer + if (BufferSize > 0) + { + MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer + free(Buffer); // Free the memory that was allocated + } + } + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")End of function to compute matching: " << endl; + fflush(stdout); + cout << "\n(" << myRank << ")myCardinality: " << myCard << endl; + fflush(stdout); + cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl; + fflush(stdout); + cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl; + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges; + cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2; + cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled; + cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd; + if (msgInd > 0) + { + cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n"; + } + fflush(stdout); +#endif + +#pragma omp task + { + *msgActualSent = msgActual; + *msgIndSent = msgInd; + if (msgInd > 0) + { + *msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0; + } + else + { + *msgPercent = 0; + } + } + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Done" << endl; + fflush(stdout); +#endif + } + } +} diff --git a/amgprec/impl/aggregator/computeCandidateMate.cpp b/amgprec/impl/aggregator/computeCandidateMate.cpp new file mode 100644 index 00000000..7d4e7ce8 --- /dev/null +++ b/amgprec/impl/aggregator/computeCandidateMate.cpp @@ -0,0 +1,73 @@ +#include "MatchBoxPC.h" + +/** + * Execute the research fr the Candidate Mate without controlling if the vertices are 
already matched. + * Returns the vertices with the highest weight + * @param adj1 + * @param adj2 + * @param verLocInd + * @param edgeLocWeight + * @return + */ +MilanLongInt firstComputeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanLongInt *verLocInd, + MilanReal *edgeLocWeight) +{ + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + int finalK; + for (int k = adj1; k < adj2; k++) { + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + finalK = k; + } + } // End of for loop + return finalK; +} + +/** + * //TODO documentation + * @param adj1 + * @param adj2 + * @param edgeLocWeight + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +MilanLongInt computeCandidateMate(MilanLongInt adj1, + MilanLongInt adj2, + MilanReal *edgeLocWeight, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) +{ + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + + MilanInt w = -1; + MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN + for (k = adj1; k < adj2; k++) { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) + continue; + + if ((edgeLocWeight[k] > heaviestEdgeWt) || + ((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) { + heaviestEdgeWt = edgeLocWeight[k]; + w = verLocInd[k]; + } + } // End of for loop + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + + return w; +} diff --git a/amgprec/impl/aggregator/dataStrStaticQueue.h b/amgprec/impl/aggregator/dataStrStaticQueue.h index eecbffeb..c6e6882a 100755 --- a/amgprec/impl/aggregator/dataStrStaticQueue.h +++ b/amgprec/impl/aggregator/dataStrStaticQueue.h @@ -80,9 +80,11 @@ class staticQueue MilanLongInt squeueTail; MilanLongInt NumNodes; + //FIXME I had to comment this piece of code in order to make everything work. + // why? 
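  // A plausible (unverified) answer to the question above: declaring a private,
  // unimplemented copy constructor and assignment operator makes staticQueue
  // non-copyable, so any refactored code that copies a queue by value -- for
  // instance via an OpenMP firstprivate clause or by keeping queues in an STL
  // container -- no longer compiles. Commenting the declarations out re-enables
  // the compiler-generated member-wise copy; explicitly defining (or defaulting)
  // the copy operations would be the cleaner long-term fix.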
//Prevent Assignment and Pass by Value: - staticQueue(const staticQueue& src); - staticQueue& operator=(const staticQueue& rhs); + //staticQueue(const staticQueue& src); + //staticQueue& operator=(const staticQueue& rhs); public: //Constructors and Destructors diff --git a/amgprec/impl/aggregator/extractUChunk.cpp b/amgprec/impl/aggregator/extractUChunk.cpp new file mode 100644 index 00000000..923a0b51 --- /dev/null +++ b/amgprec/impl/aggregator/extractUChunk.cpp @@ -0,0 +1,31 @@ +#include "MatchBoxPC.h" + +void extractUChunk( + vector &UChunkBeingProcessed, + vector &U, + vector &privateU) +{ + + UChunkBeingProcessed.clear(); +#pragma omp critical(U) + { + + if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U + { + while (!privateU.empty()) + UChunkBeingProcessed.push_back(privateU.back()); + privateU.pop_back(); + } + else + { + for (int i = 0; i < UCHUNK; i++) + { // Pop the new nodes + if (U.empty()) + break; + UChunkBeingProcessed.push_back(U.back()); + U.pop_back(); + } + } + + } // End of critical U // End of critical U +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/findOwnerOfGhost.cpp b/amgprec/impl/aggregator/findOwnerOfGhost.cpp new file mode 100644 index 00000000..b9d60614 --- /dev/null +++ b/amgprec/impl/aggregator/findOwnerOfGhost.cpp @@ -0,0 +1,29 @@ +#include "MatchBoxPC.h" + +/// Find the owner of a ghost node: +MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, + MilanInt myRank, MilanInt numProcs) +{ + + MilanLongInt mStartInd = mVerDistance[myRank]; + MilanInt Start = 0; + MilanInt End = numProcs; + MilanInt Current = 0; + + while (Start <= End) + { + Current = (End + Start) / 2; + // CASE-1: + if (mVerDistance[Current] == vtxIndex) return Current; + else // CASE 2: + if (mVerDistance[Current] > vtxIndex) + End = Current - 1; + else // CASE 3: + Start = Current + 1; + } // End of While() + + if (mVerDistance[Current] > vtxIndex) + return (Current - 1); + + return Current; +} // End of findOwnerOfGhost() diff --git a/amgprec/impl/aggregator/initialize.cpp b/amgprec/impl/aggregator/initialize.cpp new file mode 100644 index 00000000..17a4169e --- /dev/null +++ b/amgprec/impl/aggregator/initialize.cpp @@ -0,0 +1,304 @@ +#include "MatchBoxPC.h" + +void initialize(MilanLongInt NLVer, MilanLongInt NLEdge, + MilanLongInt StartIndex, MilanLongInt EndIndex, + MilanLongInt *numGhostEdges, + MilanLongInt *numGhostVertices, + MilanLongInt *S, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + map &Ghost2LocalMap, + vector &Counter, + vector &verGhostPtr, + vector &verGhostInd, + vector &tempCounter, + vector &GMate, + vector &Message, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + MilanLongInt *&candidateMate, + vector &U, + vector &privateU, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) +{ + + MilanLongInt insertMe = 0; + MilanLongInt adj1, adj2; + int i, v, k, w; + // index that starts with zero to |Vg| - 1 + map::iterator storedAlready; + +#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD) + { + +#pragma omp single + { + +#ifdef TIME_TRACKER + double Ghost2LocalInitialization = MPI_Wtime(); +#endif + + /* + * OMP Ghost2LocalInitialization + * This loop analyzes all the edges and when finds a ghost edge + * puts it in the Ghost2LocalMap. + * A critical region is needed when inserting data in the map. 
+ * + * Despite the critical region it is still productive to + * parallelize this cycle because the critical region is exeuted + * only when a ghost edge is found and ghost edges are a minority, + * circa 3.5% during the tests. + */ +#pragma omp task depend(out \ + : *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices) + { +#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \ + : numGhostEdges[:1]) + for (i = 0; i < NLEdge; i++) + { // O(m) - Each edge stored twice + insertMe = verLocInd[i]; + if ((insertMe < StartIndex) || (insertMe > EndIndex)) + { // Find a ghost + (*numGhostEdges)++; +#pragma omp critical + { + storedAlready = Ghost2LocalMap.find(insertMe); + if (storedAlready != Ghost2LocalMap.end()) + { // Has already been added + Counter[storedAlready->second]++; // Increment the counter + } + else + { // Insert an entry for the ghost: + Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry + Counter.push_back(1); // Initialize the counter + (*numGhostVertices)++; // Increment the number of ghost vertices + } // End of else() + } + } // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) ) + } // End of for(ghost vertices) + } // end of task depend + + // *numGhostEdges = atomicNumGhostEdges; +#ifdef TIME_TRACKER + Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization; + fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges; + if (!Ghost2LocalMap.empty()) + { + cout << "\n(" << myRank << ")Final Map : on process "; + cout << "\n(" << myRank << ")Key \t Value \t Counter \n"; + fflush(stdout); + storedAlready = Ghost2LocalMap.begin(); + do + { + cout << storedAlready->second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl; + fflush(stdout); + storedAlready++; + } while (storedAlready != Ghost2LocalMap.end()); + } +#endif + +#pragma omp task depend(out \ + : verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \ + : *numGhostVertices, *numGhostEdges) + { + + // Initialize adjacency Lists for Ghost Vertices: + try + { + verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector + tempCounter.reserve(*numGhostVertices); // Pointer Vector + verGhostInd.reserve(*numGhostEdges); // Index Vector + GMate.reserve(*numGhostVertices); // Ghost Mate Vector + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + // Initialize the Vectors: + verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector + tempCounter.resize(*numGhostVertices, 0); // Temporary Counter + verGhostInd.resize(*numGhostEdges, -1); // Index Vector + GMate.resize(*numGhostVertices, -1); // Temporary Counter + verGhostPtr[0] = 0; // The first value +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Ghost Vertex Pointer: "; + fflush(stdout); +#endif + + } // End of task + +#pragma omp task depend(out \ + : verGhostPtr) depend(in \ + : Counter, *numGhostVertices) + { + +#ifdef TIME_TRACKER + double verGhostPtrInitialization = MPI_Wtime(); +#endif + for (i = 0; i < *numGhostVertices; i++) + { // O(|Ghost Vertices|) + verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i]; +#ifdef PRINT_DEBUG_INFO_ + cout << verGhostPtr[i] << "\t"; + fflush(stdout); +#endif + } + +#ifdef TIME_TRACKER + verGhostPtrInitialization = 
MPI_Wtime() - verGhostPtrInitialization; + fprintf(stderr, "verGhostPtrInitialization time: %f\n", verGhostPtrInitialization); +#endif + } // End of task + +#ifdef PRINT_DEBUG_INFO_ + if (*numGhostVertices > 0) + cout << verGhostPtr[*numGhostVertices] << "\n"; + fflush(stdout); +#endif + +#ifdef TIME_TRACKER + double verGhostIndInitialization = MPI_Wtime(); +#endif + + /* + * OMP verGhostIndInitialization + * + * In this cycle the verGhostInd is initialized + * with the datas related to ghost edges. + * The check to see if a node is a ghost node is + * executed in paralle and when a ghost node + * is found a critical region is started. + * + * Despite the critical region it's still useful to + * parallelize the for cause the ghost nodes + * are a minority hence the critical region is executed + * few times, circa 3.5% of the times in the tests. + */ +#pragma omp task depend(in \ + : insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \ + : verGhostInd) + { +#pragma omp taskloop num_tasks(NUM_THREAD) + for (v = 0; v < NLVer; v++) + { + adj1 = verLocPtr[v]; // Vertex Pointer + adj2 = verLocPtr[v + 1]; + for (k = adj1; k < adj2; k++) + { + w = verLocInd[k]; // Get the adjacent vertex + if ((w < StartIndex) || (w > EndIndex)) + { // Find a ghost +#pragma omp critical + { + insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert + tempCounter[Ghost2LocalMap[w]]++; // Increment the counter + } + verGhostInd[insertMe] = v + StartIndex; // Add the adjacency + } // End of if((w < StartIndex) || (w > EndIndex)) + } // End of for(k) + } // End of for (v) + } // end of tasklopp + +#ifdef TIME_TRACKER + verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization; + fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Ghost Vertex Index: "; + for (v = 0; v < *numGhostEdges; v++) + cout << verGhostInd[v] << "\t"; + cout << endl; + fflush(stdout); +#endif + +#pragma omp task depend(in \ + : *numGhostEdges) depend(out \ + : QLocalVtx, QGhostVtx, QMsgType, QOwner) + { + try + { + QLocalVtx.reserve(*numGhostEdges); // Local Vertex + QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex + QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure) + QOwner.reserve(*numGhostEdges); // Owner of the ghost: COmpute once and use later + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + } // end of task + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Allocating CandidateMate.. 
"; + fflush(stdout); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl; + fflush(stdout); +#endif + +#pragma omp task depend(in \ + : *numGhostVertices) depend(out \ + : candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) + { + + // Allocate Data Structures: + /* + * candidateMate was a vector and has been replaced with an array + * there is no point in using the vector (or maybe there is (???)) + * so I replaced it with an array wich is slightly faster + */ + candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)]; + + *S = (*numGhostVertices); // Initialize S with number of Ghost Vertices + + /* + * Create the Queue Data Structure for the Dominating Set + * + * I had to declare the staticuQueue U before the parallel region + * to have it in the correct scope. Since we can't change the dimension + * of a staticQueue I had to destroy the previous object and instantiate + * a new one of the correct size. + */ + //new (&U) staticQueue(NLVer + (*numGhostVertices)); + U.reserve(NLVer + (*numGhostVertices)); + + // Initialize the private vectors + privateQLocalVtx.reserve(*numGhostVertices); + privateQGhostVtx.reserve(*numGhostVertices); + privateQMsgType.reserve(*numGhostVertices); + privateQOwner.reserve(*numGhostVertices); + privateU.reserve(*numGhostVertices); + } // end of task + + } // End of single region + } // End of parallel region +} diff --git a/amgprec/impl/aggregator/isAlreadyMatched.cpp b/amgprec/impl/aggregator/isAlreadyMatched.cpp new file mode 100644 index 00000000..a7d65c15 --- /dev/null +++ b/amgprec/impl/aggregator/isAlreadyMatched.cpp @@ -0,0 +1,46 @@ +#include "MatchBoxPC.h" + +/** + * //TODO documentation + * @param k + * @param verLocInd + * @param StartIndex + * @param EndIndex + * @param GMate + * @param Mate + * @param Ghost2LocalMap + * @return + */ +bool isAlreadyMatched(MilanLongInt node, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap) +{ + + /* +#pragma omp critical(Mate) + { + if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex? 
+ result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched + } else { //A local vertex + result = (Mate[node - StartIndex] >= 0); // Already matched + } + + } + */ + MilanLongInt val; + if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex + { +#pragma omp atomic read + val = GMate[Ghost2LocalMap[node]]; + return val >= 0; // Already matched + } + + // If not ghost vertex +#pragma omp atomic read + val = Mate[node - StartIndex]; + + return val >= 0; // Already matched +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp new file mode 100644 index 00000000..ffb8d2a3 --- /dev/null +++ b/amgprec/impl/aggregator/parallelComputeCandidateMateB.cpp @@ -0,0 +1,27 @@ +#include "MatchBoxPC.h" + +void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanInt myRank, + MilanReal *edgeLocWeight, + MilanLongInt *candidateMate) +{ + + MilanLongInt v = -1; + +#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD) + { + +#pragma omp for schedule(static) + for (v = 0; v < NLVer; v++) { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; + fflush(stdout); +#endif + // Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight); + // End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v) + } + } +} diff --git a/amgprec/impl/aggregator/processCrossEdge.cpp b/amgprec/impl/aggregator/processCrossEdge.cpp new file mode 100644 index 00000000..e844f127 --- /dev/null +++ b/amgprec/impl/aggregator/processCrossEdge.cpp @@ -0,0 +1,24 @@ +#include "MatchBoxPC.h" + +void PROCESS_CROSS_EDGE(MilanLongInt *edge, + MilanLongInt *S) +{ + // Start: PARALLEL_PROCESS_CROSS_EDGE_B + MilanLongInt captureCounter; + +#pragma omp atomic capture + captureCounter = --(*edge); // Decrement + + //assert(captureCounter >= 0); + + if (captureCounter == 0) +#pragma omp atomic + (*S)--; // Decrement S + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages"; + fflush(stdout); +#endif + + // End: PARALLEL_PROCESS_CROSS_EDGE_B +} \ No newline at end of file diff --git a/amgprec/impl/aggregator/processExposedVertex.cpp b/amgprec/impl/aggregator/processExposedVertex.cpp new file mode 100644 index 00000000..2b38ec7a --- /dev/null +++ b/amgprec/impl/aggregator/processExposedVertex.cpp @@ -0,0 +1,195 @@ +#include "MatchBoxPC.h" + +void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer, + MilanLongInt *candidateMate, + MilanLongInt *verLocInd, + MilanLongInt *verLocPtr, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *Mate, + vector &GMate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *S, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + vector &U, + vector &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) +{ + + MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0; + MilanInt ghostOwner = 0, option, igw; + +#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \ + 
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \ + default(shared) num_threads(NUM_THREAD) + + { +#pragma omp for reduction(+ \ + : PCounter[:numProcs], myCard \ + [:1], msgInd \ + [:1], NumMessagesBundled \ + [:1]) \ + schedule(static) + for (v = 0; v < NLVer; v++) { + option = -1; + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + k = candidateMate[v]; + candidateMate[v] = verLocInd[k]; + w = candidateMate[v]; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl; + fflush(stdout); +#endif + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) + { + +#pragma omp critical(processExposed) + { + if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) { + w = computeCandidateMate(verLocPtr[v], + verLocPtr[v + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + candidateMate[v] = w; + } + + if (w >= 0) { + (*myCard)++; + if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex + option = 2; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) { + option = 1; + Mate[v] = w; + GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost + + } // End of if CandidateMate[w] = v + + } // End of if a Ghost Vertex + else { // w is a local vertex + + if (candidateMate[w - StartIndex] == (v + StartIndex)) { + option = 3; + Mate[v] = w; // v is local + Mate[w - StartIndex] = v + StartIndex; // w is local + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") "; + fflush(stdout); +#endif + + } // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) ) + } // End of Else + + } // End of second if + + } // End critical processExposed + + } // End of if(w >=0) + else { + // This piece of code is executed a really small amount of times + adj11 = verLocPtr[v]; + adj12 = verLocPtr[v + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + (*msgInd)++; + (*NumMessagesBundled)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + } + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + + switch (option) + { + case -1: + break; + case 1: + privateU.push_back(v + StartIndex); + privateU.push_back(w); + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")"; + fflush(stdout); +#endif + + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); + case 2: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message (291):"; + cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + (*msgInd)++; + 
(*NumMessagesBundled)++; + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + + privateQLocalVtx.push_back(v + StartIndex); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + default: + privateU.push_back(v + StartIndex); + privateU.push_back(w); + break; + } + + } // End of for ( v=0; v < NLVer; v++ ) + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + } // End of parallel region +} diff --git a/amgprec/impl/aggregator/processMatchedVertices.cpp b/amgprec/impl/aggregator/processMatchedVertices.cpp new file mode 100644 index 00000000..d9363c39 --- /dev/null +++ b/amgprec/impl/aggregator/processMatchedVertices.cpp @@ -0,0 +1,294 @@ +#include "MatchBoxPC.h" + +void processMatchedVertices( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + vector &U, + vector &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) +{ + + MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + int option; + MilanLongInt mateVal; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef COUNT_LOCAL_VERTEX + MilanLongInt localVertices = 0; +#endif + //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \ + privateQMsgType, privateQOwner, UChunkBeingProcessed) \ + default(shared) num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1]) + { + + while (!U.empty()) { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); +#endif + if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices + +#ifdef COUNT_LOCAL_VERTEX + localVertices++; +#endif + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); +#endif +#pragma omp atomic read + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) { +#pragma omp critical + { + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = 
computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { // A ghost +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); +#endif + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); + + (*myCard)++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + PCounter[ghostOwner]++; + (*NumMessagesBundled)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); + + (*myCard)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + + PCounter[ghostOwner]++; + (*NumMessagesBundled)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + 
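+                  // Queue a FAILURE notification for the owner of ghost w: no dominating edge was found for v.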
privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + // assert(ghostOwner != -1); + // assert(ghostOwner != myRank); + + (*NumMessagesBundled)++; + PCounter[ghostOwner]++; + (*msgInd)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + +#pragma omp critical(U) + { + U.insert(U.end(), privateU.begin(), privateU.end()); + } + + privateU.clear(); + +#pragma omp critical(sendMessageTransfer) + { + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); + } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + + } // End of while ( !U.empty() ) + +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); + +#endif + } // End of parallel region +} diff --git a/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp new file mode 100644 index 00000000..469d7a16 --- /dev/null +++ b/amgprec/impl/aggregator/processMatchedVerticesAndSendMessages.cpp @@ -0,0 +1,308 @@ +#include "MatchBoxPC.h" +//#define DEBUG_HANG_ +void processMatchedVerticesAndSendMessages( + MilanLongInt NLVer, + vector &UChunkBeingProcessed, + vector &U, + vector &privateU, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *NumMessagesBundled, + MilanLongInt *SPtr, + MilanLongInt *verLocPtr, + MilanLongInt *verLocInd, + MilanLongInt *verDistance, + MilanLongInt *PCounter, + vector &Counter, + MilanInt myRank, + MilanInt numProcs, + MilanLongInt *candidateMate, + vector &GMate, + MilanLongInt *Mate, + map &Ghost2LocalMap, + MilanReal *edgeLocWeight, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner, + MPI_Comm comm, + MilanLongInt *msgActual, + vector &Message) +{ + + MilanLongInt initialSize = QLocalVtx.size(); + MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner; + int option; + MilanLongInt mateVal; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + +#ifdef COUNT_LOCAL_VERTEX + MilanLongInt localVertices = 0; +#endif + //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \ + firstprivate(Message, privateU, 
StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\ + privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \ + num_threads(NUM_THREAD) \ + reduction(+ \ + : msgInd[:1], PCounter \ + [:numProcs], myCard \ + [:1], NumMessagesBundled \ + [:1], msgActual \ + [:1]) + { + + while (!U.empty()) { + + extractUChunk(UChunkBeingProcessed, U, privateU); + + for (MilanLongInt u : UChunkBeingProcessed) { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")u: " << u; + fflush(stdout); +#endif + if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices + +#ifdef COUNT_LOCAL_VERTEX + localVertices++; +#endif + + // Get the Adjacency list for u + adj1 = verLocPtr[u - StartIndex]; // Pointer + adj2 = verLocPtr[u - StartIndex + 1]; + for (k = adj1; k < adj2; k++) { + option = -1; + v = verLocInd[k]; + + if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v]; + fflush(stdout); +#endif +#pragma omp atomic read + mateVal = Mate[v - StartIndex]; + // If the current vertex is pointing to a matched vertex and is not matched + if (mateVal < 0) { +#pragma omp critical + { + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], + verLocPtr[v - StartIndex + 1], + edgeLocWeight, 0, + verLocInd, + StartIndex, + EndIndex, + GMate, + Mate, + Ghost2LocalMap); + + candidateMate[v - StartIndex] = w; + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) { + + if ((w < StartIndex) || (w > EndIndex)) { // A ghost +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message:"; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); +#endif + option = 2; + + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + option = 1; + Mate[v - StartIndex] = w; // v is a local vertex + GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex + + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + option = 3; + Mate[v - StartIndex] = w; // v is a local vertex + Mate[w - StartIndex] = v; // w is a local vertex + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else + option = 4; // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of If (candidateMate[v-StartIndex] == u + } // End of task + } // mateval < 0 + } // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex: + else { // Neighbor is a ghost vertex + +#pragma omp critical + { + if (candidateMate[NLVer + Ghost2LocalMap[v]] == u) + candidateMate[NLVer + Ghost2LocalMap[v]] = -1; + if (v != Mate[u - StartIndex]) + option = 5; // u is local + } // End of critical + } // End of Else //A Ghost Vertex + + switch (option) + { + case -1: + // No things to do + break; + case 1: + // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v + privateU.push_back(v); + privateU.push_back(w); + (*myCard)++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") "; + 
fflush(stdout); +#endif + // Decrement the counter: + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr); + case 2: + + // Found a dominating edge, it is a ghost + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(REQUEST); + privateQOwner.push_back(ghostOwner); + break; + case 3: + privateU.push_back(v); + privateU.push_back(w); + (*myCard)++; + break; + case 4: + // Could not find a dominating vertex + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { // A ghost + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs); + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = v; // LOCAL + // Message[1] = w; // GHOST + // Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(v); + privateQGhostVtx.push_back(w); + privateQMsgType.push_back(FAILURE); + privateQOwner.push_back(ghostOwner); + + } // End of if(GHOST) + } // End of for loop + break; + case 5: + default: + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a success message: "; + cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n"; + fflush(stdout); +#endif + + ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs); + + // Build the Message Packet: + // Message[0] = u; // LOCAL + // Message[1] = v; // GHOST + // Message[2] = SUCCESS; // TYPE + // Send a Request (Asynchronous) + // MPI_Bsend(&Message[0], 3, TypeMap(), ghostOwner, ComputeTag, comm); + + (*msgActual)++; + (*msgInd)++; + + privateQLocalVtx.push_back(u); + privateQGhostVtx.push_back(v); + privateQMsgType.push_back(SUCCESS); + privateQOwner.push_back(ghostOwner); + + break; + } // End of switch + } // End of inner for + } + } // End of outer for + + queuesTransfer(U, privateU, QLocalVtx, + QGhostVtx, + QMsgType, QOwner, privateQLocalVtx, + privateQGhostVtx, + privateQMsgType, + privateQOwner); + + } // End of while ( !U.empty() ) + +#ifdef COUNT_LOCAL_VERTEX + printf("Count local vertexes: %ld for thread %d of processor %d\n", + localVertices, + omp_get_thread_num(), + myRank); + +#endif + } // End of parallel region + + // Send the messages +#ifdef DEBUG_HANG_ + cout << myRank<<" Sending: "<(), ghostOwner, ComputeTag, comm); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); + } +#ifdef DEBUG_HANG_ + cout << myRank<<" Done sending messages"< &Ghost2LocalMap, + vector &GMate, + vector &Counter, + MilanLongInt StartIndex, + MilanLongInt EndIndex, + MilanLongInt *myCard, + MilanLongInt *msgInd, + MilanLongInt *msgActual, + MilanReal *edgeLocWeight, + MilanLongInt *verDistance, + MilanLongInt *verLocPtr, + MilanLongInt k, + MilanLongInt *verLocInd, + MilanInt numProcs, + 
MilanInt myRank, + MPI_Comm comm, + vector &Message, + MilanLongInt numGhostEdges, + MilanLongInt u, + MilanLongInt v, + MilanLongInt *S, + vector &U) +{ + + //#define PRINT_DEBUG_INFO_ + + MilanInt Sender; + MPI_Status computeStatus; + MilanLongInt bundleSize, w; + MilanLongInt adj11, adj12, k1; + MilanLongInt ghostOwner; + int error_codeC; + error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); + char error_message[MPI_MAX_ERROR_STRING]; + int message_length; + MilanLongInt message_type = 0; + + // Buffer to receive bundled messages + // Maximum messages that can be received from any processor is + // twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS) + vector ReceiveBuffer; + try + { + ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + +#ifdef PRINT_DEBUG_INFO_ + cout + << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")About to begin Message processing phase ... *S=" << *S << endl; + fflush(stdout); +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << "=========================************===============================" << endl; + fflush(stdout); + fflush(stdout); +#endif + // BLOCKING RECEIVE: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << " Waiting for blocking receive..." << endl; + fflush(stdout); + fflush(stdout); +#endif + + //cout << myRank<<" Receiving ..."; + error_codeC = MPI_Recv(&Message[0], 3, TypeMap(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) + { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n"; + fflush(stdout); + } + Sender = computeStatus.MPI_SOURCE; + //cout << " ...from "<(), Sender, BundleTag, comm, &computeStatus); + if (error_codeC != MPI_SUCCESS) { + MPI_Error_string(error_codeC, error_message, &message_length); + cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n"; + fflush(stdout); + } +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message Bundle After: " << endl; + for (int i = 0; i < bundleSize; i++) + cout << ReceiveBuffer[i] << ","; + cout << endl; + fflush(stdout); +#endif + } else { // Just a single message: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl; + fflush(stdout); +#endif + // Add the current message to Queue: + bundleSize = 3; //#of integers in the message + // Build the Message Buffer: + if (!ReceiveBuffer.empty()) + ReceiveBuffer.clear(); // Empty it out first + ReceiveBuffer.resize(bundleSize, -1); // Initialize + + ReceiveBuffer[0] = Message[0]; // u + ReceiveBuffer[1] = Message[1]; // v + ReceiveBuffer[2] = Message[2]; // message_type + } + +#ifdef DEBUG_GHOST_ + if ((v < StartIndex) || (v > EndIndex)) { + cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl; + fflush(stdout); + } +#endif +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Processing message: u= " 
<< u << " v= " << v << " Type= " << message_type << endl; + fflush(stdout); +#endif + + // Most of the time bundleSize == 3, thus, it's not worth parallelizing thi loop + for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) { + u = ReceiveBuffer[bundleCounter - 3]; // GHOST + v = ReceiveBuffer[bundleCounter - 2]; // LOCAL + message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE + + // CASE I: REQUEST + if (message_type == REQUEST) { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is REQUEST" << endl; + fflush(stdout); +#endif +#ifdef DEBUG_GHOST_ + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { + cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } + +#endif + + if (Mate[v - StartIndex] == -1) { + // Process only if not already matched (v is local) + candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost + if (candidateMate[v - StartIndex] == u) { + GMate[Ghost2LocalMap[u]] = v; // u is ghost + Mate[v - StartIndex] = u; // v is local + U.push_back(v); + U.push_back(u); + (*myCard)++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl; + fflush(stdout); +#endif + + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); + } // End of if ( candidateMate[v-StartIndex] == u )e + } // End of if ( Mate[v] == -1 ) + } // End of REQUEST + else { // CASE II: SUCCESS + if (message_type == SUCCESS) { +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is SUCCESS" << endl; + fflush(stdout); +#endif + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process it again + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); +#ifdef DEBUG_GHOST_ + if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) { + cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl; + fflush(stdout); + } +#endif + if (Mate[v - StartIndex] == -1) { + // Process only if not already matched ( v is local) + if (candidateMate[v - StartIndex] == u) { + // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, + verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap); + candidateMate[v - StartIndex] = w; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl; + fflush(stdout); +#endif + // If found a dominating edge: + if (w >= 0) { + if ((w < StartIndex) || (w > EndIndex)) { + // w is a ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = REQUEST; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a request message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); + (*msgInd)++; + (*msgActual)++; + if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) { + Mate[v - StartIndex] = w; // v is local + GMate[Ghost2LocalMap[w]] = v; // w is ghost + U.push_back(v); + U.push_back(w); + (*myCard)++; +#ifdef 
PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); +#endif + + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S); + } // End of if CandidateMate[w] = v + } // End of if a Ghost Vertex + else { // w is a local vertex + if (candidateMate[w - StartIndex] == v) { + Mate[v - StartIndex] = w; // v is local + Mate[w - StartIndex] = v; // w is local + // Q.push_back(u); + U.push_back(v); + U.push_back(w); + (*myCard)++; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl; + fflush(stdout); +#endif + } // End of if(CandidateMate(w) = v + } // End of Else + } // End of if(w >=0) + else { // No dominant edge found + adj11 = verLocPtr[v - StartIndex]; + adj12 = verLocPtr[v - StartIndex + 1]; + for (k1 = adj11; k1 < adj12; k1++) { + w = verLocInd[k1]; + if ((w < StartIndex) || (w > EndIndex)) { + // A ghost + // Build the Message Packet: + Message[0] = v; // LOCAL + Message[1] = w; // GHOST + Message[2] = FAILURE; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending a failure message: "; + cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl; + fflush(stdout); +#endif + ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs); + //assert(ghostOwner != -1); + //assert(ghostOwner != myRank); + //cout << myRank<<" Sending to "<(), ghostOwner, ComputeTag, comm); + (*msgInd)++; + (*msgActual)++; + } // End of if(GHOST) + } // End of for loop + } // End of Else: w == -1 + // End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v) + } // End of if ( candidateMate[v-StartIndex] == u ) + } // End of if ( Mate[v] == -1 ) + } // End of if ( message_type == SUCCESS ) + else { + // CASE III: FAILURE +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Message type is FAILURE" << endl; + fflush(stdout); +#endif + GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a Dummy Mate to make sure that we do not (u is a ghost) process this anymore + PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter + } // End of else: CASE III + } // End of else: CASE I + } + + return; +} diff --git a/amgprec/impl/aggregator/queueTransfer.cpp b/amgprec/impl/aggregator/queueTransfer.cpp new file mode 100644 index 00000000..33c65749 --- /dev/null +++ b/amgprec/impl/aggregator/queueTransfer.cpp @@ -0,0 +1,36 @@ +#include "MatchBoxPC.h" + +void queuesTransfer(vector &U, + vector &privateU, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &privateQLocalVtx, + vector &privateQGhostVtx, + vector &privateQMsgType, + vector &privateQOwner) +{ + +#pragma omp critical(U) + { + U.insert(U.end(), privateU.begin(), privateU.end()); + } + + privateU.clear(); + +#pragma omp critical(sendMessageTransfer) + { + + QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end()); + QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end()); + QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end()); + QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end()); + } + + privateQLocalVtx.clear(); + privateQGhostVtx.clear(); + privateQMsgType.clear(); + privateQOwner.clear(); + +} diff --git a/amgprec/impl/aggregator/sendBundledMessages.cpp b/amgprec/impl/aggregator/sendBundledMessages.cpp new file mode 100644 index 00000000..80a88b94 --- /dev/null +++ 
b/amgprec/impl/aggregator/sendBundledMessages.cpp @@ -0,0 +1,209 @@ +#include "MatchBoxPC.h" + +void sendBundledMessages(MilanLongInt *numGhostEdges, + MilanInt *BufferSize, + MilanLongInt *Buffer, + vector &PCumulative, + vector &PMessageBundle, + vector &PSizeInfoMessages, + MilanLongInt *PCounter, + MilanLongInt NumMessagesBundled, + MilanLongInt *msgActual, + MilanLongInt *msgInd, + MilanInt numProcs, + MilanInt myRank, + MPI_Comm comm, + vector &QLocalVtx, + vector &QGhostVtx, + vector &QMsgType, + vector &QOwner, + vector &SRequest, + vector &SStatus) +{ + + MilanLongInt myIndex = 0, numMessagesToSend; + MilanInt i = 0, OneMessageSize = 0; + +#ifdef DEBUG_HANG_ + if (myRank == 0) + cout << "\n(" << myRank << ") Send Bundles" << endl; + fflush(stdout); +#endif + +#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD) + { +#pragma omp master + { +// Data structures for Bundled Messages: +#pragma omp task depend(inout \ + : PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \ + : NumMessagesBundled, numProcs) + { + try { + PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message + PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure + PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize + PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable + PSizeInfoMessages.resize(numProcs * 3, 0); + } + +#pragma omp task depend(inout \ + : PCumulative) depend(in \ + : PCounter) + { + for (i = 0; i < numProcs; i++) + PCumulative[i + 1] = PCumulative[i] + PCounter[i]; + } + +#pragma omp task depend(inout \ + : PCounter) + { + // Reuse PCounter to keep track of how many messages were inserted: + for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses needs to be integers! + PCounter[i] = 0; + } + +// Build the Message Bundle packet: +#pragma omp task depend(in \ + : PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \ + : myIndex, PMessageBundle, PCounter) +{ + for (i = 0; i < NumMessagesBundled; i++) { + myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3; + PMessageBundle[myIndex + 0] = QLocalVtx[i]; + PMessageBundle[myIndex + 1] = QGhostVtx[i]; + PMessageBundle[myIndex + 2] = QMsgType[i]; + PCounter[QOwner[i]]++; + } + } + +// Send the Bundled Messages: Use ISend +#pragma omp task depend(out \ + : SRequest, SStatus) + { + try + { + SRequest.reserve(numProcs * 2); // At most two messages per processor + SStatus.reserve(numProcs * 2); // At most two messages per processor + } + catch (length_error) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n"; + cout << "Not enough memory to allocate the internal variables \n"; + exit(1); + } + } + +// Send the Messages +#pragma omp task depend(inout \ + : SRequest, PSizeInfoMessages, PCumulative) depend(out \ + : *msgActual, *msgInd) +{ + for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses needs to be integers! 
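+    // For every other rank with a nonempty bundle, two MPI_Isend calls follow:
+    // first the 3-integer size-info packet, then the bundled message data itself.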
+ if (i == myRank) // Do not send anything to yourself + continue; + // Send the Message with information about the size of next message: + // Build the Message Packet: + PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message + PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet + PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE + // Send a Request (Asynchronous) +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl; + fflush(stdout); +#endif + if (PSizeInfoMessages[i * 3 + 0] > 0) + { // Send only if it is a nonempty packet + MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap(), i, ComputeTag, comm, + &SRequest[(*msgInd)]); + (*msgActual)++; + (*msgInd)++; + // Now Send the message with the data packet: +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl; + for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++) + cout << PMessageBundle[k] << ","; + cout << endl; + fflush(stdout); +#endif + MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0], + TypeMap(), i, BundleTag, comm, &SRequest[(*msgInd)]); + (*msgInd)++; + } // End of if size > 0 + } +} + +#pragma omp task depend(inout \ + : PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner) +{ + + // Free up temporary memory: + PCumulative.clear(); + QLocalVtx.clear(); + QGhostVtx.clear(); + QMsgType.clear(); + QOwner.clear(); +} + +#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges) +{ + +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2; + cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled; + if (*numGhostEdges > 0) + { + cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n"; + } + fflush(stdout); +#endif + + // Allocate memory for MPI Send messages: + /* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */ + OneMessageSize = 0; + MPI_Pack_size(3, TypeMap(), comm, &OneMessageSize); // Size of one message packet + // How many messages to send? + // Potentially three kinds of messages will be sent/received: + // Request, Success, Failure. + // But only two will be sent from a given processor. + // Substract the number of messages that have already been sent as bundled messages: + numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled; + *BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend; +} + +#pragma omp task depend(out : Buffer) depend(in : *BufferSize) + { + Buffer = 0; +#ifdef PRINT_DEBUG_INFO_ + cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize; + cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD; + cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges; + cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend; + cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize); + cout << "\n(" << myRank << ")Attaching Buffer on.. 
"; + fflush(stdout); +#endif + if ((*BufferSize) > 0) + { + Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory + if (Buffer == 0) + { + cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n"; + cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n"; + exit(1); + } + MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer + } + } +} +} +} diff --git a/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 index e79c90c9..27896806 100644 --- a/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_c_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_cspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_c_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! + if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 index e9e55a9a..6bd4e1ac 100644 --- a/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_d_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_dspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. 
name='amg_d_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! + if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 index 271b31d0..034151d3 100644 --- a/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_s_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_sspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_s_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! 
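+    ! Time the coarse-matrix build (mat_bld) separately from the assembly and map-build phases below.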
+ if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! + if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 b/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 index 07ab3e0b..eb11cad2 100644 --- a/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 +++ b/amgprec/impl/level/amg_z_base_onelev_mat_asb.f90 @@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) type(psb_zspmat_type) :: ac, op_restr, op_prol integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: debug_level, debug_unit + integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1 + logical, parameter :: do_timings=.false. name='amg_z_onelev_mat_asb' call psb_erractionsave(err_act) @@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) info = psb_success_ ctxt = desc_a%get_context() call psb_info(ctxt,me,np) + if ((do_timings).and.(idx_matbld==-1)) & + & idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld") + if ((do_timings).and.(idx_matasb==-1)) & + & idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb") + if ((do_timings).and.(idx_mapbld==-1)) & + & idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld") call amg_check_def(lv%parms%aggr_prol,'Smoother',& & amg_smooth_prol_,is_legal_ml_aggr_prol) @@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! the mapping defined by amg_aggrmap_bld and applying the aggregation ! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! + if (do_timings) call psb_tic(idx_matbld) call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) - + if (do_timings) call psb_toc(idx_matbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') goto 9999 @@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info) ! Now build its descriptor and convert global indices for ! ac, op_restr and op_prol ! 
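+    ! The assembly (mat_asb) and the subsequent map build (bld_map) are timed under separate indices.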
+ if (do_timings) call psb_tic(idx_matasb) if (info == psb_success_) & & call lv%aggr%mat_asb(lv%parms,a,desc_a,& & lv%ac,lv%desc_ac,op_prol,op_restr,info) - + if (do_timings) call psb_toc(idx_matasb) + if (do_timings) call psb_tic(idx_mapbld) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) + if (do_timings) call psb_toc(idx_mapbld) if(info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') goto 9999 diff --git a/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 index f760c80f..11ea6576 100644 --- a/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_c_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_c_gs_solver_bld.f90 b/amgprec/impl/solver/amg_c_gs_solver_bld.f90 index 3cdfe7e7..79be20b5 100644 --- a/amgprec/impl/solver/amg_c_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_c_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='c_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
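+    ! Time the a%tril call that extracts the lower/upper triangular parts (sv%l, sv%u) used by the Gauss-Seidel sweeps.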
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 index 859c8ebe..de5f91f8 100644 --- a/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_d_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_d_gs_solver_bld.f90 b/amgprec/impl/solver/amg_d_gs_solver_bld.f90 index 3cbc78ee..918712b5 100644 --- a/amgprec/impl/solver/amg_d_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_d_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 index e96e1229..d285d0b3 100644 --- a/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_s_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_s_gs_solver_bld.f90 b/amgprec/impl/solver/amg_s_gs_solver_bld.f90 index b4580f0f..6e0870b7 100644 --- a/amgprec/impl/solver/amg_s_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_s_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='s_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 b/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 index dec629f5..a953c608 100644 --- a/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_z_bwgs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='d_bwgs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("BWGS_BLD: tril") n_row = desc_a%get_local_rows() @@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. ! + !write(0,*) 'Calling A%TRIL in bwgs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) + if (do_timings) call psb_toc(idx_tril) else diff --git a/amgprec/impl/solver/amg_z_gs_solver_bld.f90 b/amgprec/impl/solver/amg_z_gs_solver_bld.f90 index b347937a..748a6122 100644 --- a/amgprec/impl/solver/amg_z_gs_solver_bld.f90 +++ b/amgprec/impl/solver/amg_z_gs_solver_bld.f90 @@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level character(len=20) :: name='z_gs_solver_bld', ch_err + integer(psb_ipk_), save :: idx_tril=-1 + logical, parameter :: do_timings=.true. info=psb_success_ call psb_erractionsave(err_act) @@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) call psb_info(ctxt, me, np) if (debug_level >= psb_debug_outer_) & & write(debug_unit,*) me,' ',trim(name),' start' + if ((do_timings).and.(idx_tril==-1)) & + & idx_tril = psb_get_timer_idx("GS_BLD: tril") n_row = desc_a%get_local_rows() @@ -76,9 +80,12 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold) ! ! This cuts out the off-diagonal part, because it's supposed to ! be handled by the outer Jacobi smoother. - ! + ! 
+ !write(0,*) 'Calling A%TRIL in gs_solver_bld' + if (do_timings) call psb_tic(idx_tril) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) - + if (do_timings) call psb_toc(idx_tril) + !write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros() else info = psb_err_missing_override_method_ diff --git a/exec.sh b/exec.sh new file mode 100755 index 00000000..1181f776 --- /dev/null +++ b/exec.sh @@ -0,0 +1,25 @@ +cd amgprec/impl/aggregator/ +rm MatchBoxPC.o +rm sendBundledMessages.o +rm initialize.o +rm extractUChunk.o +rm isAlreadyMatched.o +rm findOwnerOfGhost.o +rm computeCandidateMate.o +rm parallelComputeCandidateMateB.o +rm processMatchedVertices.o +rm processCrossEdge.o +rm queueTransfer.o +rm processMessages.o +rm processExposedVertex.o +rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o +rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o +cd ../../../ +make all +cd samples/advanced/pdegen +make amg_d_pde3d +cd runs +mpirun -np 4 amg_d_pde3d amg_pde3d.inp + + + diff --git a/samples/advanced/pdegen/Makefile b/samples/advanced/pdegen/Makefile index 0720b6f3..b5092a22 100644 --- a/samples/advanced/pdegen/Makefile +++ b/samples/advanced/pdegen/Makefile @@ -3,7 +3,7 @@ AMGINCDIR=$(AMGDIR)/include include $(AMGINCDIR)/Make.inc.amg4psblas AMGMODDIR=$(AMGDIR)/modules AMGLIBDIR=$(AMGDIR)/lib -AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec +AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec FINCLUDES=$(FMFLAG). $(FMFLAG)$(AMGMODDIR) $(FMFLAG)$(AMGINCDIR) $(PSBLAS_INCLUDES) $(FIFLAG). LINKOPT= diff --git a/samples/advanced/pdegen/amg_d_genpde_mod.F90 b/samples/advanced/pdegen/amg_d_genpde_mod.F90 index d6acd01c..ec3affc5 100644 --- a/samples/advanced/pdegen/amg_d_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_d_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! 
compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! 
term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! 
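! A standalone sketch, for illustration only (not part of the patch): inside an
! "!$omp do" construct one cannot "exit" the loop or "goto 9999" out of the parallel
! region, which is why the OpenMP assembly above replaces the old "exit" statements
! with "cycle" and leaves the error branch as "!goto 9999". The hypothetical program
! below shows the same pattern with a dummy failure in place of psb_spins/psb_geins:
! each thread keeps a private status, skips its remaining blocks once it has failed,
! and publishes the failure to the shared status only once.
program omp_cycle_on_error
  implicit none
  integer :: info, info_loc, ii
  info = 0
  !$omp parallel private(info_loc, ii) shared(info)
  info_loc = 0
  !$omp do schedule(dynamic)
  do ii = 1, 1000
    if (info_loc /= 0) cycle               ! a failed thread skips its remaining iterations
    if (mod(ii, 317) == 0) info_loc = ii   ! stand-in for an insertion failure
  end do
  !$omp end do
  if (info_loc /= 0) then
    !$omp critical
    info = info_loc                        ! update the shared status only on failure
    !$omp end critical
  end if
  !$omp end parallel
  if (info /= 0) write(0,*) 'a recorded failure in block ', info
end program omp_cycle_on_error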
- allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(done,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! 
term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(done,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) diff --git a/samples/advanced/pdegen/amg_d_pde2d.f90 b/samples/advanced/pdegen/amg_d_pde2d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_d_pde2d.f90 rename to samples/advanced/pdegen/amg_d_pde2d.F90 index c036aa6d..145c5890 100644 --- a/samples/advanced/pdegen/amg_d_pde2d.f90 +++ b/samples/advanced/pdegen/amg_d_pde2d.F90 @@ -73,6 +73,9 @@ program amg_d_pde2d use amg_d_pde2d_exp_mod use amg_d_pde2d_box_mod use amg_d_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -93,7 +96,7 @@ program amg_d_pde2d type(psb_d_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -197,6 +200,15 @@ program amg_d_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
This should not happen, but just in case @@ -451,12 +463,14 @@ program amg_d_pde2d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size - write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) - write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) - write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) - write(psb_out_unit,'("Iterations to convergence : ",i12)') iter + write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) + write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) + write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) + write(psb_out_unit,'("Iterations to convergence : ",i12)') iter write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs() write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier diff --git a/samples/advanced/pdegen/amg_d_pde3d.f90 b/samples/advanced/pdegen/amg_d_pde3d.F90 similarity index 97% rename from samples/advanced/pdegen/amg_d_pde3d.f90 rename to samples/advanced/pdegen/amg_d_pde3d.F90 index 1f6118ca..75dcd1a0 100644 --- a/samples/advanced/pdegen/amg_d_pde3d.f90 +++ b/samples/advanced/pdegen/amg_d_pde3d.F90 @@ -74,6 +74,9 @@ program amg_d_pde3d use amg_d_pde3d_exp_mod use amg_d_pde3d_gauss_mod use amg_d_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -94,7 +97,7 @@ program amg_d_pde3d type(psb_d_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -192,12 +195,21 @@ program amg_d_pde3d ! other variables integer(psb_ipk_) :: info, i, k character(len=20) :: name,ch_err - + type(psb_d_csr_sparse_mat) :: amold info=psb_success_ call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
This should not happen, but just in case @@ -390,7 +402,7 @@ program amg_d_pde3d end if call psb_barrier(ctxt) t1 = psb_wtime() - call prec%smoothers_build(a,desc_a,info) + call prec%smoothers_build(a,desc_a,info,amold=amold) tprec = psb_wtime()-t1 if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') @@ -455,7 +467,9 @@ program amg_d_pde3d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) @@ -478,7 +492,7 @@ program amg_d_pde3d write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() end if - + call psb_print_timers(ctxt) ! ! cleanup storage and exit ! diff --git a/samples/advanced/pdegen/amg_s_genpde_mod.F90 b/samples/advanced/pdegen/amg_s_genpde_mod.F90 index 7d32cf30..dfa79ab3 100644 --- a/samples/advanced/pdegen/amg_s_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_s_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! 
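! A standalone sketch, for illustration only (not part of the patch): the val(icoeff)
! terms assembled in this routine come from central differences for
! -div(a grad u) + b . grad u + c u on a uniform grid of spacing h, assuming (as the
! surrounding names suggest) sqdeltah = h**2 and deltah2 = 2*h. With constant
! coefficients the seven stencil entries of a row must sum to c, which the tiny
! check below verifies; a, b, c and h here are arbitrary stand-in values.
program stencil_rowsum_check
  implicit none
  integer, parameter :: dp = kind(1.d0)
  real(dp), parameter :: h = 0.01_dp, a = 1.0_dp, b = 3.0_dp, c = 2.0_dp
  real(dp) :: sqdeltah, deltah2, west, east, centre, rowsum
  sqdeltah = h*h
  deltah2  = 2.0_dp*h
  west   = -a/sqdeltah - b/deltah2        ! same form as the (x-1,y,z) term above
  east   = -a/sqdeltah + b/deltah2        ! same form as the (x+1,y,z) term
  centre = (2.0_dp*3.0_dp)*a/sqdeltah + c ! each of the three directions adds 2*a/h**2
  rowsum = centre + 3.0_dp*(west + east)  ! the convection contributions cancel in pairs
  write(*,*) 'row sum = ', rowsum, ' (expected c = ', c, ')'
end program stencil_rowsum_check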
- val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! 
term depending on (x,y,z) + val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. 
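! A standalone sketch, for illustration only (not part of the patch): when a neighbour
! index in the loops above falls outside the grid (ix == 1, ix == idim, and so on), the
! corresponding column is not inserted; the known Dirichlet value g(...) is instead
! folded into the right-hand side through zt(k) = g(...)*(-val(icoeff)) + zt(k).
! The 1D analogue below applies the same move for -u'' = f with u(0) = gl, u(1) = gr;
! gl, gr and the zero forcing term are stand-in values.
program fold_dirichlet_into_rhs
  implicit none
  integer, parameter :: dp = kind(1.d0), n = 5
  real(dp), parameter :: gl = 1.0_dp, gr = 2.0_dp
  real(dp) :: h, west, east, rhs(n)
  integer  :: i
  h    = 1.0_dp/real(n+1, dp)
  west = -1.0_dp/(h*h)
  east = -1.0_dp/(h*h)
  rhs  = 0.0_dp                       ! stand-in for the forcing term f
  rhs(1) = gl*(-west) + rhs(1)        ! same pattern as zt(k) = g(...)*(-val) + zt(k)
  rhs(n) = gr*(-east) + rhs(n)
  do i = 1, n
    write(*,'(a,i2,a,es12.5)') ' rhs(', i, ') = ', rhs(i)
  end do
end program fold_dirichlet_into_rhs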
- call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! 
term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) diff --git a/samples/advanced/pdegen/amg_s_pde2d.f90 b/samples/advanced/pdegen/amg_s_pde2d.F90 similarity index 98% rename from samples/advanced/pdegen/amg_s_pde2d.f90 rename to samples/advanced/pdegen/amg_s_pde2d.F90 index a81d16ff..ad28d1f6 100644 --- a/samples/advanced/pdegen/amg_s_pde2d.f90 +++ b/samples/advanced/pdegen/amg_s_pde2d.F90 @@ -73,6 +73,9 @@ program amg_s_pde2d use amg_s_pde2d_exp_mod use amg_s_pde2d_box_mod use amg_s_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -93,7 +96,7 @@ program amg_s_pde2d type(psb_s_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -197,6 +200,15 @@ program amg_s_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
This should not happen, but just in case @@ -451,12 +463,14 @@ program amg_s_pde2d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size - write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) - write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) - write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) - write(psb_out_unit,'("Iterations to convergence : ",i12)') iter + write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) + write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) + write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) + write(psb_out_unit,'("Iterations to convergence : ",i12)') iter write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs() write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier diff --git a/samples/advanced/pdegen/amg_s_pde3d.f90 b/samples/advanced/pdegen/amg_s_pde3d.F90 similarity index 97% rename from samples/advanced/pdegen/amg_s_pde3d.f90 rename to samples/advanced/pdegen/amg_s_pde3d.F90 index 7542c3a2..cda6a48b 100644 --- a/samples/advanced/pdegen/amg_s_pde3d.f90 +++ b/samples/advanced/pdegen/amg_s_pde3d.F90 @@ -74,6 +74,9 @@ program amg_s_pde3d use amg_s_pde3d_exp_mod use amg_s_pde3d_gauss_mod use amg_s_genpde_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -94,7 +97,7 @@ program amg_s_pde3d type(psb_s_vect_type) :: x,b,r ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv @@ -192,12 +195,21 @@ program amg_s_pde3d ! other variables integer(psb_ipk_) :: info, i, k character(len=20) :: name,ch_err - + type(psb_s_csr_sparse_mat) :: amold info=psb_success_ call psb_init(ctxt) call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif if (iam < 0) then ! 
This should not happen, but just in case @@ -390,7 +402,7 @@ program amg_s_pde3d end if call psb_barrier(ctxt) t1 = psb_wtime() - call prec%smoothers_build(a,desc_a,info) + call prec%smoothers_build(a,desc_a,info,amold=amold) tprec = psb_wtime()-t1 if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') @@ -455,7 +467,9 @@ program amg_s_pde3d call psb_sum(ctxt,precsize) call prec%descr(info,iout=psb_out_unit) if (iam == psb_root_) then - write(psb_out_unit,'("Computed solution on ",i8," processors")') np + write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np + write(psb_out_unit,'("Number of threads : ",i12)') nth + write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) @@ -478,7 +492,7 @@ program amg_s_pde3d write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() end if - + call psb_print_timers(ctxt) ! ! cleanup storage and exit ! diff --git a/samples/advanced/pdegen/runs/amg_pde3d.inp b/samples/advanced/pdegen/runs/amg_pde3d.inp index eb254780..7a3329cd 100644 --- a/samples/advanced/pdegen/runs/amg_pde3d.inp +++ b/samples/advanced/pdegen/runs/amg_pde3d.inp @@ -1,6 +1,6 @@ %%%%%%%%%%% General arguments % Lines starting with % are ignored. CSR ! Storage format CSR COO JAD -0080 ! IDIM; domain size. Linear system size is IDIM**3 +0200 ! IDIM; domain size. Linear system size is IDIM**3 CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES 2 ! ISTOPC @@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F 30 ! IRST (restart for RGMRES and BiCGSTABL) 1.d-6 ! EPS %%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%% -ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) +ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML %%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%% FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous. @@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL -3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default -3 ! Target coarse matrix size per process; if <0, lib default SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED -COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED -MATCHBOXP ! aggregation measure SOC1, MATCHBOXP +DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED +SOC1 ! aggregation measure SOC1, MATCHBOXP 8 ! Requested size of the aggregates for MATCHBOXP NATURAL ! Ordering of aggregation NATURAL DEGREE -1.5 ! Coarsening ratio, if < 0 use library default
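% Note (comment lines are ignored by the sample): with IDIM = 0200 the rule stated
% above, "Linear system size is IDIM**3", gives 200**3 = 8,000,000 unknowns, against
% 512,000 at the previous value 0080. This input also selects DEC parallel aggregation
% with the SOC1 measure instead of COUPLED with MATCHBOXP, and the drivers patched
% above now report the OpenMP thread count and nth*np as the total number of tasks.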