Merge branch 'dev-openmp' into development

sfilippone 1 year ago
commit 0bcc9d7b55

@ -1,4 +1,3 @@
AMG4PSBLAS
Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.8)

@ -143,9 +143,10 @@ contains
type(psb_ld_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false.
& debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info)
ilaggr = -1
ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows()
nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info
end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then
if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle
else
if (ilaggr(k) == -1) then
if (ilaggr(k) == ilaggr_neginit) then
wk = w(k)
widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then
if (ilaggr(idx) == -1) then
if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local
! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if
nlpairs = nlpairs+1
else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
else
! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
end if
end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1
end if
end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if
end if
end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2)
! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries
do k=1,nr
if (ilaggr(k) == -2) then
if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k)
if (idx > nr) then
i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if
end if
if (ilaggr(k) <0) then
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
else if (ilaggr(k) <0) then
write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if
end do
if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block
if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',&
write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if

@ -143,9 +143,10 @@ contains
type(psb_ls_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false.
& debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info)
ilaggr = -1
ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows()
nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info
end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then
if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle
else
if (ilaggr(k) == -1) then
if (ilaggr(k) == ilaggr_neginit) then
wk = w(k)
widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then
if (ilaggr(idx) == -1) then
if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local
! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if
nlpairs = nlpairs+1
else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
else
! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
end if
end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1
end if
end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if
end if
end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2)
! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries
do k=1,nr
if (ilaggr(k) == -2) then
if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k)
if (idx > nr) then
i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if
end if
if (ilaggr(k) <0) then
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
else if (ilaggr(k) <0) then
write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if
end do
if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block
if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',&
write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if

@ -62,7 +62,22 @@ amg_s_parmatch_smth_bld.o \
amg_s_parmatch_spmm_bld_inner.o
MPCOBJS=MatchBoxPC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
sendBundledMessages.o \
initialize.o \
extractUChunk.o \
isAlreadyMatched.o \
findOwnerOfGhost.o \
clean.o \
computeCandidateMate.o \
parallelComputeCandidateMateB.o \
processMatchedVertices.o \
processMatchedVerticesAndSendMessages.o \
processCrossEdge.o \
queueTransfer.o \
processMessages.o \
processExposedVertex.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
OBJS = $(FOBJS) $(MPCOBJS)

@ -60,17 +60,43 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
#if !defined(SERIAL_MPI)
MPI_Comm C_comm=MPI_Comm_f2c(icomm);
#ifdef DEBUG
fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n",
myRank,NLVer, NLEdge,verDistance[0],verDistance[1]);
#endif
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
#define TIME_TRACKER
#ifdef TIME_TRACKER
double tmr = MPI_Wtime();
#endif
#define OMP
#ifdef OMP
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#else
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#endif
#ifdef TIME_TRACKER
tmr = MPI_Wtime() - tmr;
fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer);
#endif
#endif
}

@ -52,145 +52,412 @@
#ifndef _matchboxpC_H_
#define _matchboxpC_H_
//Turn on a lot of debugging information with this switch:
// Turn on a lot of debugging information with this switch:
//#define PRINT_DEBUG_INFO_
#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <map>
#include <vector>
// #include "matchboxp.h"
#include "omp.h"
#include "primitiveDataTypeDefinitions.h"
#include "dataStrStaticQueue.h"
using namespace std;
const int NUM_THREAD = 4;
const int UCHUNK = 10;
const MilanLongInt REQUEST = 1;
const MilanLongInt SUCCESS = 2;
const MilanLongInt FAILURE = 3;
const MilanLongInt SIZEINFO = 4;
const int ComputeTag = 7; // Predefined tag
const int BundleTag = 9; // Predefined tag
static vector<MilanLongInt> DEFAULT_VECTOR;
// MPI type map
template <typename T>
MPI_Datatype TypeMap();
template <>
inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template <>
inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template <>
inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template <>
inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
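// Illustrative (hypothetical) use of the map: pick the MPI datatype matching a
// buffer's element type at compile time, e.g.
//   MPI_Bcast(buf, count, TypeMap<MilanLongInt>(), 0, comm);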
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#if !defined(SERIAL_MPI)
#define MilanMpiLongInt MPI_LONG_LONG
#define MilanMpiLongInt MPI_LONG_LONG
#ifndef _primitiveDataType_Definition_
#define _primitiveDataType_Definition_
//Regular integer:
#ifndef INTEGER_H
#define INTEGER_H
typedef int32_t MilanInt;
#endif
//Regular long integer:
#ifndef LONG_INT_H
#define LONG_INT_H
#ifdef BIT64
typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt;
#else
typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt;
#endif
#endif
//Regular boolean
#ifndef BOOL_H
#define BOOL_H
typedef bool MilanBool;
#endif
//Regular double and absolute value computation:
#ifndef REAL_H
#define REAL_H
typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value)
{
return fabs(value);
}
#endif
//Regular float and absolute value computation:
#ifndef FLOAT_H
#define FLOAT_H
typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value)
{
return fabs(value);
}
#endif
//// Define the limits:
#ifndef LIMITS_H
#define LIMITS_H
//Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN
#ifdef BIT64
#define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX
#else
#define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX
#endif
#endif
// Regular integer:
#ifndef INTEGER_H
#define INTEGER_H
typedef int32_t MilanInt;
#endif
// Regular long integer:
#ifndef LONG_INT_H
#define LONG_INT_H
#ifdef BIT64
typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt;
#else
typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt;
#endif
#endif
// Regular boolean
#ifndef BOOL_H
#define BOOL_H
typedef bool MilanBool;
#endif
// Regular double and absolute value computation:
#ifndef REAL_H
#define REAL_H
typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value)
{
return fabs(value);
}
#endif
// Regular float and absolute value computation:
#ifndef FLOAT_H
#define FLOAT_H
typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value)
{
return fabs(value);
}
#endif
//// Define the limits:
#ifndef LIMITS_H
#define LIMITS_H
// Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN
#ifdef BIT64
#define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX
#else
#define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX
#endif
#endif
// +INFINITY
const double PLUS_INFINITY = numeric_limits<double>::infinity();
const double MINUS_INFINITY = -PLUS_INFINITY;
//#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY
//#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY
#endif
//Function to find the owner of a ghost vertex using binary search:
inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC
(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC
(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
// Function to find the owner of a ghost vertex using binary search:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs);
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight);
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdgesPtr,
MilanLongInt *numGhostVerticesPtr,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent);
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate);
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *SPtr);
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message);
void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
MilanInt *BufferSizePtr,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActualPtr,
MilanLongInt *MessageIndexPtr,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus);
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *msgActualPtr,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *SPtr,
vector<MilanLongInt> &U);
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
#endif
#ifdef __cplusplus

@ -72,12 +72,6 @@
#ifdef SERIAL_MPI
#else
//MPI type map
template<typename T> MPI_Datatype TypeMap();
template<> inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template<> inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template<> inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template<> inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
// DOUBLE PRECISION VERSION
//WARNING: The vertex block on a given rank is contiguous

@ -0,0 +1,554 @@
#include "MatchBoxPC.h"
// ***********************************************************************
//
// MatchboxP: A C++ library for approximate weighted matching
// Mahantesh Halappanavar (hala@pnnl.gov)
// Pacific Northwest National Laboratory
//
// ***********************************************************************
//
// Copyright (2021) Battelle Memorial Institute
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// DOMINATING EDGES MODEL ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate()
*
* Date : New update: Feb 17, 2019, Richland, Washington.
* Date : Original development: May 17, 2009, E&CS Bldg.
*
* Purpose : Compute Approximate Maximum Weight Matching in Linear Time
*
* Args : inputMatrix - instance of Compressed-Col format of Matrix
* Mate - The Mate array
*
* Returns : By Value: (void)
* By Reference: Mate
*
* Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge.
* Assumption: The Mate Array is empty.
*/
/*
NLVer = #of vertices, NLEdge = #of edges
CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers)
verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process
Mate = A vector of size |V_p| (local subgraph) to store the output (matching)
MPI: myRank, numProcs, comm,
Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world
Statistics: ph0_time, ph1_time, ph2_time: Runtimes
Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2)
*/
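/*
 Illustrative example (values are made up, assuming 0-based vertex numbering):
 with numProcs = 2 and 9 vertices overall, verDistance = { 0, 4, 9 } means that
 rank 0 owns vertices 0..3 and rank 1 owns vertices 4..8; each rank passes its own
 NLVer vertices and NLEdge incident edges, and gets back its local portion of Mate.
 The statistics arrays (msgIndSent, msgActualSent, msgPercent, ph1_card, ph2_card)
 have one entry per process.
*/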
//#define DEBUG_HANG_
#ifdef SERIAL_MPI
#else
// DOUBLE PRECISION VERSION
// WARNING: The vertex block on a given rank is contiguous
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent,
MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card)
{
/*
* verDistance: a vector of length numProcs + 1.
* verDistance[i] is the index of the first vertex owned by the i-th process;
* verDistance[i + 1] - 1 is the index of the last vertex owned by the i-th process.
* NLVer: number of local vertices (verLocPtr has NLVer + 1 entries)
* NLEdge: number of edges assigned to the current process
*
* verLocPtr/verLocInd/edgeLocWeight contain the portion of the matrix assigned
* to the process in Yale (CSR) notation:
* verLocInd: the column (neighbour) indices, row by row
* verLocPtr: the i-th value is the position of the first entry of the i-th row,
* the (i+1)-th value is the position of the first entry of the (i+1)-th row
*/
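/*
 * Purely illustrative sketch of the layout (values are made up, positions 0-based):
 * with NLVer = 3,
 *   verLocPtr     = { 0, 2, 5, 7 }
 *   verLocInd     = { 5, 9,  4, 5, 8,  4, 9 }
 *   edgeLocWeight = { .7, .2,  .9, .4, .1,  .3, .8 }
 * the entries of local row i occupy positions verLocPtr[i] .. verLocPtr[i+1]-1 of
 * verLocInd/edgeLocWeight, and NLEdge = verLocPtr[NLVer] = 7.
 */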
#if !defined(SERIAL_MPI)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Within algoEdgeApproxDominatingEdgesLinearSearchMessageBundling()";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," << verDistance[i+1];
cout << "]\n";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0) {
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," ;
cout << verDistance[numProcs]<< "]\n";
}
fflush(stdout);
#endif
MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank
MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank
MPI_Status computeStatus;
MilanLongInt msgActual = 0, msgInd = 0;
MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight
MilanReal startTime, finishTime;
startTime = MPI_Wtime();
// Data structures for sending and receiving messages:
vector<MilanLongInt> Message; // [ u, v, message_type ]
Message.resize(3, -1);
// Data structures for Message Bundling:
// Although up to two messages can be sent along any cross edge,
// only one message will be sent in the initialization phase -
// one of: REQUEST/FAILURE/SUCCESS
vector<MilanLongInt> QLocalVtx, QGhostVtx, QMsgType;
vector<MilanInt> QOwner; // Changed by Fabio to be an integer, addresses need to be integers!
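// Note (layout assumed from the declarations above): bundled messages are stored
// componentwise in these parallel vectors, i.e. the k-th queued message is the
// triple ( QLocalVtx[k], QGhostVtx[k], QMsgType[k] ), with QMsgType[k] one of
// REQUEST/SUCCESS/FAILURE and QOwner[k] the destination rank owning the ghost vertex.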
MilanLongInt *PCounter = new MilanLongInt[numProcs];
for (int i = 0; i < numProcs; i++)
PCounter[i] = 0;
MilanLongInt NumMessagesBundled = 0;
// TODO: this could be eliminated once the last computational section is refactored
MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses need to be integers!
MilanLongInt *candidateMate = nullptr;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NV: " << NLVer << " Edges: " << NLEdge;
fflush(stdout);
cout << "\n(" << myRank << ")StartIndex: " << StartIndex << " EndIndex: " << EndIndex;
fflush(stdout);
#endif
// Other Variables:
MilanLongInt u = -1, v = -1, w = -1, i = 0;
MilanLongInt k = -1, adj1 = -1, adj2 = -1;
MilanLongInt k1 = -1, adj11 = -1, adj12 = -1;
MilanLongInt myCard = 0;
// Build the Ghost Vertex Set: Vg
map<MilanLongInt, MilanLongInt> Ghost2LocalMap; // Map each ghost vertex to a local vertex
vector<MilanLongInt> Counter; // Store the edge count for each ghost vertex
MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of Ghost vertices
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
// Define Adjacency Lists for Ghost Vertices:
// cout<<"Building Ghost data structures ... \n\n";
vector<MilanLongInt> verGhostPtr, verGhostInd, tempCounter;
// Mate array for ghost vertices:
vector<MilanLongInt> GMate; // Proportional to the number of ghost vertices
MilanLongInt S;
MilanLongInt privateMyCard = 0;
vector<MilanLongInt> PCumulative, PMessageBundle, PSizeInfoMessages;
vector<MPI_Request> SRequest; // Requests that are used for each send message
vector<MPI_Status> SStatus; // Status of sent messages, used in MPI_Wait
MilanLongInt MessageIndex = 0; // Pointer for current message
MilanInt BufferSize;
MilanLongInt *Buffer;
vector<MilanLongInt> privateQLocalVtx, privateQGhostVtx, privateQMsgType;
vector<MilanInt> privateQOwner;
vector<MilanLongInt> U, privateU;
initialize(NLVer, NLEdge, StartIndex,
EndIndex, &numGhostEdges,
&numGhostVertices, &S,
verLocInd, verLocPtr,
Ghost2LocalMap, Counter,
verGhostPtr, verGhostInd,
tempCounter, GMate,
Message, QLocalVtx,
QGhostVtx, QMsgType, QOwner,
candidateMate, U,
privateU,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
finishTime = MPI_Wtime();
*ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization
#ifdef DEBUG_HANG_
cout << myRank << " Finished initialization" << endl;
fflush(stdout);
#endif
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// INITIALIZATION /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Compute the Initial Matching Set:
/*
* OMP: PARALLEL_COMPUTE_CANDIDATE_MATE_B has been split off from
* PARALLEL_PROCESS_EXPOSED_VERTEX_B so that the two can be
* parallelized more effectively.
* PARALLEL_COMPUTE_CANDIDATE_MATE_B is now fully parallel.
*/
PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer,
verLocPtr,
verLocInd,
myRank,
edgeLocWeight,
candidateMate);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " candidateMate after parallelCompute " <<endl;
for (int i=0; i<NLVer; i++) {
cout << candidateMate[i] << " " ;
}
cout << endl;
#endif
#endif
/*
* PARALLEL_PROCESS_EXPOSED_VERTEX_B
* TODO: write comment
*
* TODO: Test when it's actually more efficient to execute this code
* in parallel.
*/
PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer,
candidateMate,
verLocInd,
verLocPtr,
StartIndex,
EndIndex,
Mate,
GMate,
Ghost2LocalMap,
edgeLocWeight,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
U,
privateU,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
tempCounter.clear(); // Do not need this any more
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Exposed Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
// TODO what would be the optimal UCHUNK
vector<MilanLongInt> UChunkBeingProcessed;
UChunkBeingProcessed.reserve(UCHUNK);
processMatchedVertices(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Process Vertices" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Matched Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
sendBundledMessages(&numGhostEdges,
&BufferSize,
Buffer,
PCumulative,
PMessageBundle,
PSizeInfoMessages,
PCounter,
NumMessagesBundled,
&msgActual,
&MessageIndex,
numProcs,
myRank,
comm,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
SRequest,
SStatus);
///////////////////////// END OF SEND BUNDLED MESSAGES //////////////////////////////////
finishTime = MPI_Wtime();
*ph1_time = finishTime - startTime; // Time taken for Phase-1
#ifdef DEBUG_HANG_
cout << myRank << " Finished sendBundles" << endl;
fflush(stdout);
#endif
*ph1_card = myCard; // Cardinality at the end of Phase-1
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// MAIN LOOP //////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Main While Loop:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Entering While(true) loop..";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
while (true) {
#ifdef DEBUG_HANG_
//if (myRank == 0)
cout << "\n(" << myRank << ") Main loop" << endl;
fflush(stdout);
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMatchedVerticesAndSendMessages(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner,
comm,
&msgActual,
Message);
///////////////////////// END OF PROCESS MATCHED VERTICES /////////////////////////
//// BREAK IF NO MESSAGES EXPECTED /////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMatchedAndSend phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Deciding whether to break: S= " << S << endl;
#endif
if (S == 0) {
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Breaking out" << endl;
fflush(stdout);
#endif
break;
}
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MESSAGES //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMessages(NLVer,
Mate,
candidateMate,
Ghost2LocalMap,
GMate,
Counter,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&msgActual,
edgeLocWeight,
verDistance,
verLocPtr,
k,
verLocInd,
numProcs,
myRank,
comm,
Message,
numGhostEdges,
u,
v,
&S,
U);
///////////////////////// END OF PROCESS MESSAGES /////////////////////////////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMessages phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl;
fflush(stdout);
#endif
} // End of while (true)
clean(NLVer,
myRank,
MessageIndex,
SRequest,
SStatus,
BufferSize,
Buffer,
msgActual,
msgActualSent,
msgInd,
msgIndSent,
NumMessagesBundled,
msgPercent);
finishTime = MPI_Wtime();
*ph2_time = finishTime - startTime; // Time taken for Phase-2
*ph2_card = myCard; // Cardinality at the end of Phase-2
}
// End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate
#endif
#endif

@ -97,6 +97,8 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_c_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. the total number of aggregates NAGGR;
! 2. how much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
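! A worked example of the renumbering, with made-up numbers: for NR=8 rows and
! NTHS=2 threads, BNDS = (/1,5,9/); suppose thread 0 creates 2 aggregates and
! thread 1 creates 3, so LOCNAGGR(0:1) = (/2,3/) and NAGGR = 5. During the loop
! an entry is temporarily encoded as ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS+KK,
! so that the defining thread is recovered as MOD(ITMP,NTHS) and the
! thread-local ordinal as ITMP/NTHS - (BNDS(KK)-1); after the exclusive prefix
! sum of LOCNAGGR, thread 0's aggregates end up numbered 1..2 and thread 1's 3..5.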
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case, neighborhood only
! contains I even if it does not look like it from matrix)
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neigbour; mark strong links with +1, weak links with -1
!
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone
s_neigh_coo%val(k) = sone
else
s_neigh_coo%val(ip) = -sone
s_neigh_coo%val(k) = -sone
end if
else
s_neigh_coo%val(k) = -sone
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
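!
! Illustrative only: a minimal, self-contained sketch (not library code, all
! values hypothetical) of the strength-of-connection measure computed above:
!   mu(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|)
! and a link i->j is marked strong when mu(i,j) >= theta (the SOC1 variant
! applies the equivalent test |a(i,j)| > theta*sqrt(|a(i,i)*a(j,j)|)).
program soc_measure_sketch
  implicit none
  integer, parameter :: nz = 3
  real, parameter :: theta = 0.4
  real :: dii, djj(nz), aij(nz), mu
  integer :: k
  ! hypothetical diagonal of row i, diagonals of its neighbours j, off-diagonals a(i,j)
  dii = 4.0
  djj = [4.0, 4.0, 10.0]
  aij = [-2.0, -0.5, -1.0]
  do k = 1, nz
    mu = abs(aij(k))/sqrt(abs(dii*djj(k)))
    if (mu >= theta) then
      write(*,*) k, ' strong link, mu =', mu
    else
      write(*,*) k, ' weak   link, mu =', mu
    end if
  end do
end program soc_measure_sketch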
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
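!
! Illustrative only: a self-contained sketch (hypothetical data, not part of
! the library) of the encoding and decoding of thread-local aggregate numbers
! used in the OpenMP branch above.  Thread KK encodes its LOCNAGGR(KK)-th
! aggregate as ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS + KK, so that
! MOD(ITMP,NTHS) recovers the owning thread and the quotient, shifted by the
! exclusive prefix sum of LOCNAGGR, gives the final aggregate number.
program aggr_encode_decode_sketch
  implicit none
  integer, parameter :: nths = 4
  integer :: bnds(0:nths), locnaggr(0:nths+1)
  integer :: kk, kp, itmp, i
  bnds = [1, 4, 7, 9, 11]            ! hypothetical split of 10 rows: sizes 3,3,2,2
  locnaggr = 0
  locnaggr(0:nths-1) = [2, 1, 2, 1]  ! hypothetical per-thread counts from phase one
  kk = 2                             ! thread 2 creates its 2nd local aggregate
  itmp = (bnds(kk)-1+2)*nths + kk    ! = (6+2)*4+2 = 34
  do i = 1, nths                     ! inclusive prefix sum ...
    locnaggr(i) = locnaggr(i) + locnaggr(i-1)
  end do
  do i = nths+1, 1, -1               ! ... shifted to an exclusive one
    locnaggr(i) = locnaggr(i-1)
  end do
  locnaggr(0) = 0                    ! locnaggr(0:nths) = [0,2,3,5,6]
  kp = mod(itmp, nths)               ! = 2: the thread that created this aggregate
  write(*,*) 'global aggregate number:', (itmp/nths) - (bnds(kp)-1) + locnaggr(kp)  ! = 8-6+3 = 5
end program aggr_encode_decode_sketch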
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = czero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
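!
! Illustrative only: the exact dropping test is not visible in this hunk; the
! dense sketch below assumes the usual criterion that an off-diagonal entry is
! filtered out of Af when |a(i,j)| < theta*sqrt(|a(i,i)*a(j,j)|), with the
! dropped entries lumped onto the diagonal (af(i,i) = a(i,i) - sum of dropped
! a(i,j)), which is what the CSR loop above does via TMP and JD.
program filter_lump_sketch
  implicit none
  integer, parameter :: n = 3
  real, parameter :: theta = 0.2
  real :: a(n,n), af(n,n), tmp
  integer :: i, j
  a = reshape([ 4.,-1.,-0.1,  -1., 4.,-1.,  -0.1,-1., 4. ], [n,n])
  af = a
  do i = 1, n
    tmp = 0.0
    do j = 1, n
      if (j == i) cycle
      if (abs(a(i,j)) < theta*sqrt(abs(a(i,i)*a(j,j)))) then
        tmp = tmp + a(i,j)     ! accumulate the dropped (weak) entries
        af(i,j) = 0.0
      end if
    end do
    af(i,i) = af(i,i) - tmp    ! lump them onto the diagonal
  end do
  do i = 1, n
    write(*,*) af(i,:)
  end do
end program filter_lump_sketch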
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= czero) then
adiag(i) = cone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = cone
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol or acsr1 = (I-w*D*Af) Prol
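!
! Illustrative only (hypothetical 3x2 dense example, not library code): the
! smoothed prolongator is P_s = (I - omega*D*A)*P, where D holds the inverted
! diagonal entries of A (or of the filtered matrix Af, as set up in ADIAG
! above) and P is the tentative prolongator.
program smoothed_prol_sketch
  implicit none
  integer, parameter :: n = 3, nagg = 2
  real :: a(n,n), p(n,nagg), ps(n,nagg), dinv(n), omega
  integer :: i
  a = reshape([ 2.,-1., 0.,  -1., 2.,-1.,  0.,-1., 2. ], [n,n])
  p = reshape([ 1., 1., 0.,   0., 0., 1. ], [n,nagg])   ! tentative (piecewise constant) prolongator
  do i = 1, n
    dinv(i) = 1.0/a(i,i)
  end do
  omega = 2.0/3.0
  ! P_s = P - omega * D^{-1} * (A*P)
  ps = p - omega*spread(dinv,2,nagg)*matmul(a,p)
  do i = 1, n
    write(*,*) ps(i,:)
  end do
end program smoothed_prol_sketch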
@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_d_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
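!
! Illustrative only: after coo_restr%fix() the COO triples are sorted by row
! index, so all non-local rows (> nrl) sit at the tail.  A backward search for
! the last entry with ia(k) <= nrl is then enough to truncate to local rows,
! which is what the SEARCH loop above does.  The sketch below works on plain
! arrays with hypothetical data and assumes at least one local entry exists.
program keep_local_rows_sketch
  implicit none
  integer, parameter :: nrl = 3
  integer :: ia(6), ja(6), k, nzl
  real    :: val(6)
  ! row-sorted COO triples; rows 4 and 5 are non-local
  ia  = [1, 1, 2, 3, 4, 5]
  ja  = [1, 2, 2, 3, 1, 2]
  val = [1., 2., 3., 4., 5., 6.]
  nzl = size(ia)
  search: do k = nzl, 1, -1
    if (ia(k) <= nrl) then
      nzl = k            ! keep entries 1..k only
      exit search
    end if
  end do search
  write(*,*) 'local nonzeros kept:', nzl   ! prints 4
end program keep_local_rows_sketch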
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neighbour matrix; mark strong links with +1, weak links with -1
!
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done
s_neigh_coo%val(k) = done
else
s_neigh_coo%val(ip) = -done
s_neigh_coo%val(k) = -done
end if
else
s_neigh_coo%val(k) = -done
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = dzero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= dzero) then
adiag(i) = done / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = done
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol or acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_s_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neighbour matrix; mark strong links with +1, weak links with -1
!
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone
s_neigh_coo%val(k) = sone
else
s_neigh_coo%val(ip) = -sone
s_neigh_coo%val(k) = -sone
end if
else
s_neigh_coo%val(k) = -sone
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = szero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= szero) then
adiag(i) = sone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = sone
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_z_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under a lock
! introduces a certain non-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-repeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1
!
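! For instance (hypothetical values, just to illustrate the test below):
! if muij(i,j) = |a(i,j)|/sqrt(|a(i,i)*a(j,j)|) evaluates to 0.3 and
! theta = 0.25, the link (i,j) is marked strong (+1); with theta = 0.4
! the same link would be marked weak (-1).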
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done
s_neigh_coo%val(k) = done
else
s_neigh_coo%val(ip) = -done
s_neigh_coo%val(k) = -done
end if
else
s_neigh_coo%val(k) = -done
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under a lock
! introduces a certain non-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-repeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = zzero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= zzero) then
adiag(i) = zone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = zone
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -0,0 +1,91 @@
#include "MatchBoxPC.h"
// TODO comment
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent)
{
// Cleanup Phase
#pragma omp parallel
{
#pragma omp master
{
#pragma omp task
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Waitall= " << endl;
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Waitall " << endl;
fflush(stdout);
#endif
//return;
MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]);
// MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer
if (BufferSize > 0)
{
MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer
free(Buffer); // Free the memory that was allocated
}
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")End of function to compute matching: " << endl;
fflush(stdout);
cout << "\n(" << myRank << ")myCardinality: " << myCard << endl;
fflush(stdout);
cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl;
fflush(stdout);
cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges;
cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2;
cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled;
cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd;
if (msgInd > 0)
{
cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
#pragma omp task
{
*msgActualSent = msgActual;
*msgIndSent = msgInd;
if (msgInd > 0)
{
*msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0;
}
else
{
*msgPercent = 0;
}
}
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Done" << endl;
fflush(stdout);
#endif
}
}
}

@ -0,0 +1,73 @@
#include "MatchBoxPC.h"
/**
* Search for the candidate mate without checking whether the vertices are already matched.
* Returns the position (in verLocInd/edgeLocWeight) of the heaviest edge.
* @param adj1
* @param adj2
* @param verLocInd
* @param edgeLocWeight
* @return
*/
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight)
{
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
int finalK;
for (int k = adj1; k < adj2; k++) {
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
finalK = k;
}
} // End of for loop
return finalK;
}
/**
* //TODO documentation
* @param adj1
* @param adj2
* @param edgeLocWeight
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
for (k = adj1; k < adj2; k++) {
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap))
continue;
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
}
} // End of for loop
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
return w;
}
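// A small worked trace of the selection rule above (hypothetical values,
// not taken from a real run): with adj1 = 0, adj2 = 3,
// edgeLocWeight = {2.0, 5.0, 5.0} and verLocInd = {7, 3, 9}, and assuming
// none of the three neighbours is already matched, the heaviest weight is
// 5.0 and the tie between vertices 3 and 9 is broken toward the larger
// vertex index, so the function returns w = 9.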

@ -80,9 +80,11 @@ class staticQueue
MilanLongInt squeueTail;
MilanLongInt NumNodes;
//FIXME I had to comment this piece of code in order to make everything work.
// why?
//Prevent Assignment and Pass by Value:
staticQueue(const staticQueue& src);
staticQueue& operator=(const staticQueue& rhs);
//staticQueue(const staticQueue& src);
//staticQueue& operator=(const staticQueue& rhs);
public:
//Constructors and Destructors

@ -0,0 +1,31 @@
#include "MatchBoxPC.h"
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU)
{
UChunkBeingProcessed.clear();
#pragma omp critical(U)
{
if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U
{
while (!privateU.empty()) {
UChunkBeingProcessed.push_back(privateU.back());
privateU.pop_back();
}
}
else
{
for (int i = 0; i < UCHUNK; i++)
{ // Pop the new nodes
if (U.empty())
break;
UChunkBeingProcessed.push_back(U.back());
U.pop_back();
}
}
} // End of critical(U)
}
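// Usage sketch (UCHUNK is the chunk size defined in MatchBoxPC.h; the
// numbers here are hypothetical): if the shared queue U holds 25 vertices
// and UCHUNK is 10, each call moves at most 10 of them into
// UChunkBeingProcessed, so the critical(U) region is entered once per
// chunk rather than once per popped vertex; if U is empty, the caller's
// privateU is drained instead.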

@ -0,0 +1,29 @@
#include "MatchBoxPC.h"
/// Find the owner of a ghost node:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs)
{
MilanLongInt mStartInd = mVerDistance[myRank];
MilanInt Start = 0;
MilanInt End = numProcs;
MilanInt Current = 0;
while (Start <= End)
{
Current = (End + Start) / 2;
// CASE-1:
if (mVerDistance[Current] == vtxIndex) return Current;
else // CASE 2:
if (mVerDistance[Current] > vtxIndex)
End = Current - 1;
else // CASE 3:
Start = Current + 1;
} // End of While()
if (mVerDistance[Current] > vtxIndex)
return (Current - 1);
return Current;
} // End of findOwnerOfGhost()
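// Worked example (hypothetical distribution, not from the test suite):
// with numProcs = 3 and mVerDistance = {0, 4, 9, 12}, rank 0 owns
// vertices 0..3, rank 1 owns 4..8 and rank 2 owns 9..11, so
// findOwnerOfGhost(7, mVerDistance, myRank, 3) returns 1.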

@ -0,0 +1,304 @@
#include "MatchBoxPC.h"
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdges,
MilanLongInt *numGhostVertices,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt insertMe = 0;
MilanLongInt adj1, adj2;
int i, v, k, w;
// index that starts with zero to |Vg| - 1
map<MilanLongInt, MilanLongInt>::iterator storedAlready;
#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD)
{
#pragma omp single
{
#ifdef TIME_TRACKER
double Ghost2LocalInitialization = MPI_Wtime();
#endif
/*
* OMP Ghost2LocalInitialization
* This loop analyzes all the edges and, when it finds a ghost edge,
* puts it in the Ghost2LocalMap.
* A critical region is needed when inserting data into the map.
*
* Despite the critical region it is still productive to
* parallelize this loop, because the critical region is executed
* only when a ghost edge is found, and ghost edges are a minority,
* around 3.5% in the tests.
*/
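/*
* As an illustration (hypothetical ranks and indices): if this rank owns
* StartIndex..EndIndex = 4..8 and the edge list contains the endpoint 10,
* the first time 10 is seen it gets Ghost2LocalMap[10] = 0 and
* Counter[0] = 1 while numGhostVertices becomes 1; every further
* occurrence of 10 only increments Counter[0] and numGhostEdges.
*/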
#pragma omp task depend(out \
: *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices)
{
#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \
: numGhostEdges[:1])
for (i = 0; i < NLEdge; i++)
{ // O(m) - Each edge stored twice
insertMe = verLocInd[i];
if ((insertMe < StartIndex) || (insertMe > EndIndex))
{ // Find a ghost
(*numGhostEdges)++;
#pragma omp critical
{
storedAlready = Ghost2LocalMap.find(insertMe);
if (storedAlready != Ghost2LocalMap.end())
{ // Has already been added
Counter[storedAlready->second]++; // Increment the counter
}
else
{ // Insert an entry for the ghost:
Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry
Counter.push_back(1); // Initialize the counter
(*numGhostVertices)++; // Increment the number of ghost vertices
} // End of else()
}
} // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
} // End of for(ghost vertices)
} // end of task depend
// *numGhostEdges = atomicNumGhostEdges;
#ifdef TIME_TRACKER
Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization;
fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges;
if (!Ghost2LocalMap.empty())
{
cout << "\n(" << myRank << ")Final Map : on process ";
cout << "\n(" << myRank << ")Key \t Value \t Counter \n";
fflush(stdout);
storedAlready = Ghost2LocalMap.begin();
do
{
cout << storedAlready->second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl;
fflush(stdout);
storedAlready++;
} while (storedAlready != Ghost2LocalMap.end());
}
#endif
#pragma omp task depend(out \
: verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \
: *numGhostVertices, *numGhostEdges)
{
// Initialize adjacency Lists for Ghost Vertices:
try
{
verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector
tempCounter.reserve(*numGhostVertices); // Pointer Vector
verGhostInd.reserve(*numGhostEdges); // Index Vector
GMate.reserve(*numGhostVertices); // Ghost Mate Vector
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
// Initialize the Vectors:
verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector
tempCounter.resize(*numGhostVertices, 0); // Temporary Counter
verGhostInd.resize(*numGhostEdges, -1); // Index Vector
GMate.resize(*numGhostVertices, -1); // Ghost Mate Vector
verGhostPtr[0] = 0; // The first value
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Pointer: ";
fflush(stdout);
#endif
} // End of task
#pragma omp task depend(out \
: verGhostPtr) depend(in \
: Counter, *numGhostVertices)
{
#ifdef TIME_TRACKER
double verGhostPtrInitialization = MPI_Wtime();
#endif
for (i = 0; i < *numGhostVertices; i++)
{ // O(|Ghost Vertices|)
verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i];
#ifdef PRINT_DEBUG_INFO_
cout << verGhostPtr[i] << "\t";
fflush(stdout);
#endif
}
#ifdef TIME_TRACKER
verGhostPtrInitialization = MPI_Wtime() - verGhostPtrInitialization;
fprintf(stderr, "verGhostPtrInitialization time: %f\n", verGhostPtrInitialization);
#endif
} // End of task
#ifdef PRINT_DEBUG_INFO_
if (*numGhostVertices > 0)
cout << verGhostPtr[*numGhostVertices] << "\n";
fflush(stdout);
#endif
#ifdef TIME_TRACKER
double verGhostIndInitialization = MPI_Wtime();
#endif
/*
* OMP verGhostIndInitialization
*
* In this loop verGhostInd is initialized
* with the data related to ghost edges.
* The check to see whether a node is a ghost node is
* executed in parallel, and when a ghost node
* is found a critical region is entered.
*
* Despite the critical region it is still useful to
* parallelize the loop, because ghost nodes
* are a minority, hence the critical region is executed
* only a few times, around 3.5% of the time in the tests.
*/
#pragma omp task depend(in \
: insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \
: verGhostInd)
{
#pragma omp taskloop num_tasks(NUM_THREAD)
for (v = 0; v < NLVer; v++)
{
adj1 = verLocPtr[v]; // Vertex Pointer
adj2 = verLocPtr[v + 1];
for (k = adj1; k < adj2; k++)
{
w = verLocInd[k]; // Get the adjacent vertex
if ((w < StartIndex) || (w > EndIndex))
{ // Find a ghost
#pragma omp critical
{
insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert
tempCounter[Ghost2LocalMap[w]]++; // Increment the counter
}
verGhostInd[insertMe] = v + StartIndex; // Add the adjacency
} // End of if((w < StartIndex) || (w > EndIndex))
} // End of for(k)
} // End of for (v)
} // end of taskloop
#ifdef TIME_TRACKER
verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization;
fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Index: ";
for (v = 0; v < *numGhostEdges; v++)
cout << verGhostInd[v] << "\t";
cout << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostEdges) depend(out \
: QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
try
{
QLocalVtx.reserve(*numGhostEdges); // Local Vertex
QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex
QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure)
QOwner.reserve(*numGhostEdges); // Owner of the ghost: compute once and use later
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
} // end of task
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Allocating CandidateMate.. ";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostVertices) depend(out \
: candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner)
{
// Allocate Data Structures:
/*
* candidateMate was a vector and has been replaced with an array;
* there is no clear benefit in using a vector here (or maybe there is?),
* so it was replaced with a plain array, which is slightly faster.
*/
candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)];
*S = (*numGhostVertices); // Initialize S with number of Ghost Vertices
/*
* Create the Queue Data Structure for the Dominating Set
*
* I had to declare the staticQueue U before the parallel region
* to have it in the correct scope. Since we can't change the dimension
* of a staticQueue I had to destroy the previous object and instantiate
* a new one of the correct size.
*/
//new (&U) staticQueue(NLVer + (*numGhostVertices));
U.reserve(NLVer + (*numGhostVertices));
// Initialize the private vectors
privateQLocalVtx.reserve(*numGhostVertices);
privateQGhostVtx.reserve(*numGhostVertices);
privateQMsgType.reserve(*numGhostVertices);
privateQOwner.reserve(*numGhostVertices);
privateU.reserve(*numGhostVertices);
} // end of task
} // End of single region
} // End of parallel region
}

@ -0,0 +1,46 @@
#include "MatchBoxPC.h"
/**
* //TODO documentation
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
/*
#pragma omp critical(Mate)
{
if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex?
result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched
} else { //A local vertex
result = (Mate[node - StartIndex] >= 0); // Already matched
}
}
*/
MilanLongInt val;
if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex
{
#pragma omp atomic read
val = GMate[Ghost2LocalMap[node]];
return val >= 0; // Already matched
}
// If not ghost vertex
#pragma omp atomic read
val = Mate[node - StartIndex];
return val >= 0; // Already matched
}
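// Illustration (hypothetical indices): with StartIndex = 100 and
// EndIndex = 199, isAlreadyMatched(150, ...) atomically reads
// Mate[150 - 100] = Mate[50], while isAlreadyMatched(250, ...) reads
// GMate[Ghost2LocalMap[250]]; in both cases a non-negative value means
// the vertex is already matched.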

@ -0,0 +1,27 @@
#include "MatchBoxPC.h"
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate)
{
MilanLongInt v = -1;
#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD)
{
#pragma omp for schedule(static)
for (v = 0; v < NLVer; v++) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight);
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
}
}
}

@ -0,0 +1,24 @@
#include "MatchBoxPC.h"
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *S)
{
// Start: PARALLEL_PROCESS_CROSS_EDGE_B
MilanLongInt captureCounter;
#pragma omp atomic capture
captureCounter = --(*edge); // Decrement
//assert(captureCounter >= 0);
if (captureCounter == 0)
#pragma omp atomic
(*S)--; // Decrement S
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages";
fflush(stdout);
#endif
// End: PARALLEL_PROCESS_CROSS_EDGE_B
}
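// Illustration (hypothetical counts): if Counter[Ghost2LocalMap[w]] was 3,
// three calls to PROCESS_CROSS_EDGE for that ghost are needed before the
// captured value reaches 0 and S (the number of ghost vertices still
// awaiting messages) is decremented.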

@ -0,0 +1,195 @@
#include "MatchBoxPC.h"
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *S,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0;
MilanInt ghostOwner = 0, option, igw;
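// Reading the switch at the bottom of the loop below: option stays -1 when
// there is nothing to enqueue; 1 means v matched a ghost w whose candidate
// was v (and, falling through to case 2, a REQUEST is also sent); 2 means
// w is a ghost and only a REQUEST is sent; 3 means v and w matched locally.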
#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \
default(shared) num_threads(NUM_THREAD)
{
#pragma omp for reduction(+ \
: PCounter[:numProcs], myCard \
[:1], msgInd \
[:1], NumMessagesBundled \
[:1]) \
schedule(static)
for (v = 0; v < NLVer; v++) {
option = -1;
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
k = candidateMate[v];
candidateMate[v] = verLocInd[k];
w = candidateMate[v];
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
#pragma omp critical(processExposed)
{
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) {
w = computeCandidateMate(verLocPtr[v],
verLocPtr[v + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v] = w;
}
if (w >= 0) {
(*myCard)++;
if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
option = 1;
Mate[v] = w;
GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == (v + StartIndex)) {
option = 3;
Mate[v] = w; // v is local
Mate[w - StartIndex] = v + StartIndex; // w is local
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") ";
fflush(stdout);
#endif
} // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) )
} // End of Else
} // End of second if
} // End critical processExposed
} // End of if(w >=0)
else {
// This piece of code is executed only rarely
adj11 = verLocPtr[v];
adj12 = verLocPtr[v + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
}
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
switch (option)
{
case -1:
break;
case 1:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
case 2:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message (291):";
cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
default:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
break;
}
} // End of for ( v=0; v < NLVer; v++ )
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of parallel region
}

@ -0,0 +1,294 @@
#include "MatchBoxPC.h"
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \
privateQMsgType, privateQOwner, UChunkBeingProcessed) \
default(shared) num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// Nothing to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
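// Note: no break here -- execution falls through into case 2 so that the
// REQUEST message for the ghost owner is still queued.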
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
(*NumMessagesBundled)++;
PCounter[ghostOwner]++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
}

@ -0,0 +1,308 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message)
{
MilanLongInt initialSize = QLocalVtx.size();
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\
privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1], msgActual \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// Nothing to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
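// Note: no break here -- execution falls through into case 2 so that the
// REQUEST message for the ghost owner is still queued.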
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = u; // LOCAL
// Message[1] = v; // GHOST
// Message[2] = SUCCESS; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
// Send the messages
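// Each queued entry is sent as a 3-integer packet {local vertex, ghost vertex, message type}
// to its owning process via MPI_Bsend.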
#ifdef DEBUG_HANG_
cout << myRank<<" Sending: "<<QOwner.size()-initialSize<<" messages" <<endl;
#endif
for (int i = initialSize; i < QOwner.size(); i++) {
Message[0] = QLocalVtx[i];
Message[1] = QGhostVtx[i];
Message[2] = QMsgType[i];
ghostOwner = QOwner[i];
//MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
}
#ifdef DEBUG_HANG_
cout << myRank<<" Done sending messages"<<endl;
#endif
}

@ -0,0 +1,315 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *msgActual,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *S,
vector<MilanLongInt> &U)
{
//#define PRINT_DEBUG_INFO_
MilanInt Sender;
MPI_Status computeStatus;
MilanLongInt bundleSize, w;
MilanLongInt adj11, adj12, k1;
MilanLongInt ghostOwner;
int error_codeC;
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
char error_message[MPI_MAX_ERROR_STRING];
int message_length;
MilanLongInt message_type = 0;
// Buffer to receive bundled messages
// Maximum messages that can be received from any processor is
// twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS)
vector<MilanLongInt> ReceiveBuffer;
try
{
ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
#ifdef PRINT_DEBUG_INFO_
cout
<< "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to begin Message processing phase ... *S=" << *S << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
// BLOCKING RECEIVE:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << " Waiting for blocking receive..." << endl;
fflush(stdout);
fflush(stdout);
#endif
//cout << myRank<<" Receiving ...";
error_codeC = MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS)
{
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n";
fflush(stdout);
}
Sender = computeStatus.MPI_SOURCE;
//cout << " ...from "<<Sender << endl;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl;
fflush(stdout);
#endif
if (Message[2] == SIZEINFO) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl;
fflush(stdout);
#endif
bundleSize = Message[0]; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle Before: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
// Receive the message
//cout << myRank<<" Receiving from "<<Sender<<endl;
error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(), Sender, BundleTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS) {
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n";
fflush(stdout);
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle After: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
} else { // Just a single message:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl;
fflush(stdout);
#endif
// Add the current message to Queue:
bundleSize = 3; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
ReceiveBuffer[0] = Message[0]; // u
ReceiveBuffer[1] = Message[1]; // v
ReceiveBuffer[2] = Message[2]; // message_type
}
#ifdef DEBUG_GHOST_
if ((v < StartIndex) || (v > EndIndex)) {
cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl;
fflush(stdout);
}
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl;
fflush(stdout);
#endif
// Most of the time bundleSize == 3, so it is not worth parallelizing this loop
for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) {
u = ReceiveBuffer[bundleCounter - 3]; // GHOST
v = ReceiveBuffer[bundleCounter - 2]; // LOCAL
message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE
// CASE I: REQUEST
if (message_type == REQUEST) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is REQUEST" << endl;
fflush(stdout);
#endif
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched (v is local)
candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost
if (candidateMate[v - StartIndex] == u) {
GMate[Ghost2LocalMap[u]] = v; // u is ghost
Mate[v - StartIndex] = u; // v is local
U.push_back(v);
U.push_back(u);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of REQUEST
else { // CASE II: SUCCESS
if (message_type == SUCCESS) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is SUCCESS" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a dummy Mate so that u (a ghost) is not processed again
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched ( v is local)
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k,
verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) {
// w is a ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
Mate[v - StartIndex] = w; // v is local
GMate[Ghost2LocalMap[w]] = v; // w is ghost
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
Mate[v - StartIndex] = w; // v is local
Mate[w - StartIndex] = v; // w is local
// Q.push_back(u);
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else { // No dominant edge found
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) {
// A ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
} // End of if(GHOST)
} // End of for loop
} // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of if ( message_type == SUCCESS )
else {
// CASE III: FAILURE
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is FAILURE" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a dummy Mate so that u (a ghost) is not processed anymore
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter
} // End of else: CASE III
} // End of else: CASE I
}
return;
}

@ -0,0 +1,36 @@
#include "MatchBoxPC.h"
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
}

@ -0,0 +1,209 @@
#include "MatchBoxPC.h"
void sendBundledMessages(MilanLongInt *numGhostEdges,
MilanInt *BufferSize,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActual,
MilanLongInt *msgInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus)
{
MilanLongInt myIndex = 0, numMessagesToSend;
MilanInt i = 0, OneMessageSize = 0;
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Send Bundles" << endl;
fflush(stdout);
#endif
#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD)
{
#pragma omp master
{
// Data structures for Bundled Messages:
#pragma omp task depend(inout \
: PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \
: NumMessagesBundled, numProcs)
{
try {
PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message
PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure
PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize
PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable
PSizeInfoMessages.resize(numProcs * 3, 0);
}
#pragma omp task depend(inout \
: PCumulative) depend(in \
: PCounter)
{
for (i = 0; i < numProcs; i++)
PCumulative[i + 1] = PCumulative[i] + PCounter[i];
}
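// PCumulative now holds an exclusive prefix sum of PCounter: PCumulative[p] is the
// offset (in messages) of process p's slice of PMessageBundle.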
#pragma omp task depend(inout \
: PCounter)
{
// Reuse PCounter to keep track of how many messages were inserted:
for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses need to be integers!
PCounter[i] = 0;
}
// Build the Message Bundle packet:
#pragma omp task depend(in \
: PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \
: myIndex, PMessageBundle, PCounter)
{
for (i = 0; i < NumMessagesBundled; i++) {
myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3;
PMessageBundle[myIndex + 0] = QLocalVtx[i];
PMessageBundle[myIndex + 1] = QGhostVtx[i];
PMessageBundle[myIndex + 2] = QMsgType[i];
PCounter[QOwner[i]]++;
}
}
// Send the Bundled Messages: Use ISend
#pragma omp task depend(out \
: SRequest, SStatus)
{
try
{
SRequest.reserve(numProcs * 2); // At most two messages per processor
SStatus.reserve(numProcs * 2); // At most two messages per processor
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
}
// Send the Messages
#pragma omp task depend(inout \
: SRequest, PSizeInfoMessages, PCumulative) depend(out \
: *msgActual, *msgInd)
{
for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses need to be integers!
if (i == myRank) // Do not send anything to yourself
continue;
// Send the Message with information about the size of next message:
// Build the Message Packet:
PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message
PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet
PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl;
fflush(stdout);
#endif
if (PSizeInfoMessages[i * 3 + 0] > 0)
{ // Send only if it is a nonempty packet
MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap<MilanLongInt>(), i, ComputeTag, comm,
&SRequest[(*msgInd)]);
(*msgActual)++;
(*msgInd)++;
// Now Send the message with the data packet:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl;
for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++)
cout << PMessageBundle[k] << ",";
cout << endl;
fflush(stdout);
#endif
MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0],
TypeMap<MilanLongInt>(), i, BundleTag, comm, &SRequest[(*msgInd)]);
(*msgInd)++;
} // End of if size > 0
}
}
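// At most two MPI_Isend calls are issued per destination (the size-info packet plus the
// data bundle), matching the SRequest/SStatus reservation of numProcs * 2 above.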
#pragma omp task depend(inout \
: PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
// Free up temporary memory:
PCumulative.clear();
QLocalVtx.clear();
QGhostVtx.clear();
QMsgType.clear();
QOwner.clear();
}
#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2;
cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled;
if (*numGhostEdges > 0)
{
cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
// Allocate memory for MPI Send messages:
/* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */
OneMessageSize = 0;
MPI_Pack_size(3, TypeMap<MilanLongInt>(), comm, &OneMessageSize); // Size of one message packet
// How many messages to send?
// Potentially three kinds of messages will be sent/received:
// Request, Success, Failure.
// But only two will be sent from a given processor.
// Subtract the number of messages that have already been sent as bundled messages:
numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled;
*BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend;
}
#pragma omp task depend(out : Buffer) depend(in : *BufferSize)
{
Buffer = 0;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize;
cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD;
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend;
cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize);
cout << "\n(" << myRank << ")Attaching Buffer on.. ";
fflush(stdout);
#endif
if ((*BufferSize) > 0)
{
Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory
if (Buffer == 0)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n";
exit(1);
}
MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer
}
}
}
}
}
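Aside (illustrative sketch, not part of the library): the buffer sizing above follows the standard MPI buffered-send idiom, one MPI_Pack_size packet plus MPI_BSEND_OVERHEAD per outstanding message. A minimal self-contained version is given below; the helper name attachAndUseBsendBuffer and the message count are hypothetical, and MPI_LONG stands in for TypeMap<MilanLongInt>().
#include <cstdlib>
#include <mpi.h>
// Attach a buffered-send buffer sized for nMessages three-integer packets.
void attachAndUseBsendBuffer(MPI_Comm comm, int nMessages)
{
    int packetSize = 0;
    MPI_Pack_size(3, MPI_LONG, comm, &packetSize);           // bytes needed for one 3-integer packet
    int bufferSize = (packetSize + MPI_BSEND_OVERHEAD) * nMessages;
    void *buffer = std::malloc(bufferSize);
    MPI_Buffer_attach(buffer, bufferSize);
    // ... MPI_Bsend(&msg[0], 3, MPI_LONG, dest, tag, comm) calls go here ...
    MPI_Buffer_detach(&buffer, &bufferSize);                 // blocks until all buffered sends complete
    std::free(buffer);
}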

@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_cspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_c_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_dspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_d_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_sspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_s_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_zspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_z_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='c_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='s_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='z_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -0,0 +1,25 @@
cd amgprec/impl/aggregator/
rm MatchBoxPC.o
rm sendBundledMessages.o
rm initialize.o
rm extractUChunk.o
rm isAlreadyMatched.o
rm findOwnerOfGhost.o
rm computeCandidateMate.o
rm parallelComputeCandidateMateB.o
rm processMatchedVertices.o
rm processCrossEdge.o
rm queueTransfer.o
rm processMessages.o
rm processExposedVertex.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
cd ../../../
make all
cd samples/advanced/pdegen
make amg_d_pde3d
cd runs
mpirun -np 4 amg_d_pde3d amg_pde3d.inp
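The script above is a quick rebuild-and-run helper: it removes the MatchBox object files so they are recompiled, rebuilds the library with make all, rebuilds the amg_d_pde3d sample and launches it on 4 MPI ranks with the amg_pde3d.inp input file.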

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
!
! Discretizes the partial differential equation
!
@ -128,7 +131,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_dpk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt)
talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_dpk_), allocatable :: val(:)
real(psb_dpk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
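! Note: val, irow and icol are allocated inside this BLOCK within the
! parallel region, so each thread works on its own private copies.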
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999
endif
!$omp do schedule(dynamic)
!
do ii=1, nlr, nb
if (info /= psb_success_) cycle
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -490,7 +500,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
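! omp_lib provides omp_get_num_threads()/omp_get_thread_num() used
! below; the OPENMP preprocessor guard keeps serial builds compiling.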
!
! Discretizes the partial differential equation
!
@ -591,7 +603,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_dpk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999
end if
! We build an auxiliary matrix consisting of one row at a
! time; just a small matrix, which might be extended to
! generate a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! Loop over rows belonging to the current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(done,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
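! Same pattern as the 3D generator above, here for the 5-point
! stencil: each thread allocates its own val/irow/icol inside the
! BLOCK, and the !$omp do distributes blocks of nb rows.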
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_dpk_), allocatable :: val(:)
real(psb_dpk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999  ! cannot branch out of an OpenMP parallel region; info is checked after the region
endif
! Loop over rows belonging to the current process in a block
! distribution.
!$omp do schedule(dynamic)
!
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(done,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -912,8 +938,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_d_pde2d
use amg_d_pde2d_exp_mod
use amg_d_pde2d_box_mod
use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -93,7 +96,7 @@ program amg_d_pde2d
type(psb_d_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_d_pde2d
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
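! Note: omp_get_num_threads() returns 1 when called outside a
! parallel region, so the thread count is sampled inside a
! parallel/master pair; nth is only used for the summary printout.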
if (iam < 0) then
! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_d_pde2d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_d_pde3d
use amg_d_pde3d_exp_mod
use amg_d_pde3d_gauss_mod
use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -94,7 +97,7 @@ program amg_d_pde3d
type(psb_d_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_d_pde3d
! other variables
integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err
type(psb_d_csr_sparse_mat) :: amold
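! amold is passed to smoothers_build below as a mold argument,
! presumably to request CSR storage for the smoother matrices.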
info=psb_success_
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then
! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_d_pde3d
end if
call psb_barrier(ctxt)
t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info)
call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
@ -455,7 +467,9 @@ program amg_d_pde3d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_d_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if
call psb_print_timers(ctxt)
!
! cleanup storage and exit
!

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
!
! Discretizes the partial differential equation
!
@ -128,7 +131,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_spk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt)
talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! We build an auxiliary matrix consisting of one row at a
! time; just a small matrix, which might be extended to
! generate a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! Loop over rows belonging to the current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
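! Same thread-private-workspace pattern as in the double-precision
! generator: block-local arrays, !$omp do over blocks of nb rows,
! and error propagation through info checked after the region.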
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_spk_), allocatable :: val(:)
real(psb_spk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999  ! cannot branch out of an OpenMP parallel region; info is checked after the region
endif
!$omp do schedule(dynamic)
!
do ii=1, nlr, nb
if (info /= psb_success_) cycle
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -490,7 +500,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
!
! Discretizes the partial differential equation
!
@ -591,7 +603,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_spk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999
end if
! We build an auxiliary matrix consisting of one row at a
! time; just a small matrix, which might be extended to
! generate a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! Loop over rows belonging to the current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(szero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_spk_), allocatable :: val(:)
real(psb_spk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999  ! cannot branch out of an OpenMP parallel region; info is checked after the region
endif
! Loop over rows belonging to the current process in a block
! distribution.
!$omp do schedule(dynamic)
!
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(szero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -912,8 +938,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_s_pde2d
use amg_s_pde2d_exp_mod
use amg_s_pde2d_box_mod
use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -93,7 +96,7 @@ program amg_s_pde2d
type(psb_s_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_s_pde2d
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then
! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_s_pde2d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_s_pde3d
use amg_s_pde3d_exp_mod
use amg_s_pde3d_gauss_mod
use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -94,7 +97,7 @@ program amg_s_pde3d
type(psb_s_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_s_pde3d
! other variables
integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err
type(psb_s_csr_sparse_mat) :: amold
info=psb_success_
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then
! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_s_pde3d
end if
call psb_barrier(ctxt)
t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info)
call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
@ -455,7 +467,9 @@ program amg_s_pde3d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_s_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if
call psb_print_timers(ctxt)
!
! cleanup storage and exit
!

@ -1,6 +1,6 @@
%%%%%%%%%%% General arguments % Lines starting with % are ignored.
CSR ! Storage format CSR COO JAD
0080 ! IDIM; domain size. Linear system size is IDIM**3
0200 ! IDIM; domain size. Linear system size is IDIM**3
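% For reference: IDIM=0080 gives 80**3 = 512,000 unknowns, while
% IDIM=0200 gives 200**3 = 8,000,000.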
CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE
BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES
2 ! ISTOPC
@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F
30 ! IRST (restart for RGMRES and BiCGSTABL)
1.d-6 ! EPS
%%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%%
ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars)
ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars)
ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML
%%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%%
FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous.
@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL
-3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default
-3 ! Target coarse matrix size per process; if <0, lib default
SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED
COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED
MATCHBOXP ! aggregation measure SOC1, MATCHBOXP
DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED
SOC1 ! aggregation measure SOC1, MATCHBOXP
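% (SOC1 is the strength-of-connection based measure; MATCHBOXP drives
%  the matching-based coupled aggregation.)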
8 ! Requested size of the aggregates for MATCHBOXP
NATURAL ! Ordering of aggregation NATURAL DEGREE
-1.5 ! Coarsening ratio, if < 0 use library default
