Merge branch 'dev-openmp' into development

sfilippone committed 0bcc9d7b55 (1 year ago)

@ -1,4 +1,3 @@
AMG4PSBLAS
Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.8)

@ -143,9 +143,10 @@ contains
type(psb_ld_coo_sparse_mat) :: tmpcoo type(psb_ld_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_ logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false. & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true. logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt() ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np) call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.) call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info) call psb_geall(ilaggr,desc_a,info)
ilaggr = -1 ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info) call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt) call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info if (iam == 0) write(0,*)' out from buildmatching:', info
end if end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then if (info == 0) then
if (do_timings) call psb_tic(idx_phase2) if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle cycle
else else
if (ilaggr(k) == -1) then if (ilaggr(k) == ilaggr_neginit) then
wk = w(k) wk = w(k)
widx = w(idx) widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then if (idx <= nr) then
if (ilaggr(idx) == -1) then if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local ! Now, if both vertices are local, the aggregate is local
! (kinda obvious). ! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1 nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam) ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if end if
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else if (idx <= nc) then else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
else else
! Use a statistically unbiased tie-breaking rule, ! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
end if end if
end if end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1 nlsingl = nlsingl + 1
end if end if
end if end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if end if
end if end if
end do end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3) if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local ! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2) ! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr) call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1)) ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1)) naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info) call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries ! Cleanup as yet unmarked entries
do k=1,nr do k=1,nr
if (ilaggr(k) == -2) then if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k) idx = mate(k)
if (idx > nr) then if (idx > nr) then
i = ilaggr(idx) i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if end if
end if else if (ilaggr(k) <0) then
if (ilaggr(k) <0) then write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if end if
end do end do
if (debug_sync) then if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block end block
if (iam == 0) then if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',& write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if end if
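The hunk above (and its single-precision twin below) tracks the aggregation state of each local vertex through the two named markers introduced at the top of the routine: ilaggr_neginit for "not yet aggregated" and ilaggr_nonlocal for "paired with a halo vertex, to be resolved after the halo exchange of ilaggr". A minimal C++ sketch of that marker scheme, for illustration only (the library code is the Fortran shown here); the weight threshold for singletons and the tie-breaking rule for cross-process pairs are omitted:

#include <cstdint>
#include <vector>

constexpr int64_t ILAGGR_NEGINIT  = -1;  // not yet assigned to an aggregate
constexpr int64_t ILAGGR_NONLOCAL = -2;  // mate is owned by another process; fixed up later

// mate and ilaggr are 1-based (index 0 unused); vertices 1..nr are owned,
// nr+1..nc are halo copies. mate[k] <= 0 means vertex k is unmatched.
void pair_aggregates(int64_t nr, int64_t nc,
                     const std::vector<int64_t>& mate,
                     std::vector<int64_t>& ilaggr,
                     int64_t& naggr_local)
{
    for (int64_t k = 1; k <= nr; ++k) {
        if (ilaggr[k] != ILAGGR_NEGINIT) continue;   // already assigned via its mate
        const int64_t idx = mate[k];
        if (idx >= 1 && idx <= nr) {
            // Both endpoints are owned locally: the pair becomes a new local aggregate.
            ilaggr[k] = ilaggr[idx] = ++naggr_local;
        } else if (idx > nr && idx <= nc) {
            // The mate is a halo vertex: either this rank wins the (omitted) tie-break
            // and creates the aggregate, or the entry is marked and resolved after
            // the halo exchange of ilaggr.
            ilaggr[k] = ILAGGR_NONLOCAL;
        } else {
            // Unmatched vertex: it becomes a singleton aggregate
            // (the library also checks the weight before doing so).
            ilaggr[k] = ++naggr_local;
        }
    }
}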

@ -143,9 +143,10 @@ contains
type(psb_ls_coo_sparse_mat) :: tmpcoo type(psb_ls_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_ logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false. & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true. logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt() ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np) call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.) call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info) call psb_geall(ilaggr,desc_a,info)
ilaggr = -1 ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info) call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt) call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info if (iam == 0) write(0,*)' out from buildmatching:', info
end if end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then if (info == 0) then
if (do_timings) call psb_tic(idx_phase2) if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle cycle
else else
if (ilaggr(k) == -1) then if (ilaggr(k) == ilaggr_neginit) then
wk = w(k) wk = w(k)
widx = w(idx) widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then if (idx <= nr) then
if (ilaggr(idx) == -1) then if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local ! Now, if both vertices are local, the aggregate is local
! (kinda obvious). ! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1 nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam) ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if end if
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else if (idx <= nc) then else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
else else
! Use a statistically unbiased tie-breaking rule, ! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
end if end if
end if end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1 nlsingl = nlsingl + 1
end if end if
end if end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if end if
end if end if
end do end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3) if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local ! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2) ! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr) call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1)) ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1)) naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info) call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries ! Cleanup as yet unmarked entries
do k=1,nr do k=1,nr
if (ilaggr(k) == -2) then if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k) idx = mate(k)
if (idx > nr) then if (idx > nr) then
i = ilaggr(idx) i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if end if
end if else if (ilaggr(k) <0) then
if (ilaggr(k) <0) then write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if end if
end do end do
if (debug_sync) then if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block end block
if (iam == 0) then if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',& write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if end if

@ -62,7 +62,22 @@ amg_s_parmatch_smth_bld.o \
amg_s_parmatch_spmm_bld_inner.o amg_s_parmatch_spmm_bld_inner.o
MPCOBJS=MatchBoxPC.o \ MPCOBJS=MatchBoxPC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o sendBundledMessages.o \
initialize.o \
extractUChunk.o \
isAlreadyMatched.o \
findOwnerOfGhost.o \
clean.o \
computeCandidateMate.o \
parallelComputeCandidateMateB.o \
processMatchedVertices.o \
processMatchedVerticesAndSendMessages.o \
processCrossEdge.o \
queueTransfer.o \
processMessages.o \
processExposedVertex.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
OBJS = $(FOBJS) $(MPCOBJS) OBJS = $(FOBJS) $(MPCOBJS)

@ -60,17 +60,43 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
#if !defined(SERIAL_MPI) #if !defined(SERIAL_MPI)
MPI_Comm C_comm=MPI_Comm_f2c(icomm); MPI_Comm C_comm=MPI_Comm_f2c(icomm);
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n",
myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); myRank,NLVer, NLEdge,verDistance[0],verDistance[1]);
#endif #endif
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
#define TIME_TRACKER
#ifdef TIME_TRACKER
double tmr = MPI_Wtime();
#endif
#define OMP
#ifdef OMP
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#else
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight, verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate, verDistance, Mate,
myRank, numProcs, C_comm, myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent, msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time, ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card ); ph1_card, ph2_card );
#endif
#ifdef TIME_TRACKER
tmr = MPI_Wtime() - tmr;
fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer);
#endif
#endif #endif
} }

@ -52,145 +52,412 @@
#ifndef _matchboxpC_H_ #ifndef _matchboxpC_H_
#define _matchboxpC_H_ #define _matchboxpC_H_
//Turn on a lot of debugging information with this switch: // Turn on a lot of debugging information with this switch:
//#define PRINT_DEBUG_INFO_ //#define PRINT_DEBUG_INFO_
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>
#include <assert.h> #include <assert.h>
#include <map> #include <map>
#include <vector> #include <vector>
// #include "matchboxp.h" #include "omp.h"
#include "primitiveDataTypeDefinitions.h" #include "primitiveDataTypeDefinitions.h"
#include "dataStrStaticQueue.h" #include "dataStrStaticQueue.h"
using namespace std; using namespace std;
const int NUM_THREAD = 4;
const int UCHUNK = 10;
const MilanLongInt REQUEST = 1;
const MilanLongInt SUCCESS = 2;
const MilanLongInt FAILURE = 3;
const MilanLongInt SIZEINFO = 4;
const int ComputeTag = 7; // Predefined tag
const int BundleTag = 9; // Predefined tag
static vector<MilanLongInt> DEFAULT_VECTOR;
// MPI type map
template <typename T>
MPI_Datatype TypeMap();
template <>
inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template <>
inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template <>
inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template <>
inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
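As a usage note (not part of the library): the TypeMap<> specializations above let the message-passing code pick the MPI datatype matching the build-time integer width, and the constants REQUEST/SUCCESS/FAILURE are the third field of each [u, v, message_type] triple. A hedged sketch of sending one such triple, assuming a 64-bit build (MilanLongInt == int64_t); the use of ComputeTag as the message tag is an assumption of this sketch:

// Illustrative only; error handling omitted.
inline void sendOneTriple(int64_t u, int64_t v, int64_t msgType,
                          int owner, MPI_Comm comm)
{
    int64_t message[3] = {u, v, msgType};        // msgType: REQUEST, SUCCESS or FAILURE
    MPI_Send(message, 3, TypeMap<int64_t>(),     // resolves to MPI_LONG_LONG
             owner, ComputeTag, comm);
}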
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
#if !defined(SERIAL_MPI) #if !defined(SERIAL_MPI)
#define MilanMpiLongInt MPI_LONG_LONG #define MilanMpiLongInt MPI_LONG_LONG
#ifndef _primitiveDataType_Definition_ #ifndef _primitiveDataType_Definition_
#define _primitiveDataType_Definition_ #define _primitiveDataType_Definition_
//Regular integer: // Regular integer:
#ifndef INTEGER_H #ifndef INTEGER_H
#define INTEGER_H #define INTEGER_H
typedef int32_t MilanInt; typedef int32_t MilanInt;
#endif #endif
//Regular long integer: // Regular long integer:
#ifndef LONG_INT_H #ifndef LONG_INT_H
#define LONG_INT_H #define LONG_INT_H
#ifdef BIT64 #ifdef BIT64
typedef int64_t MilanLongInt; typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt; typedef MPI_LONG MilanMpiLongInt;
#else #else
typedef int32_t MilanLongInt; typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt; typedef MPI_INT MilanMpiLongInt;
#endif #endif
#endif #endif
//Regular boolean // Regular boolean
#ifndef BOOL_H #ifndef BOOL_H
#define BOOL_H #define BOOL_H
typedef bool MilanBool; typedef bool MilanBool;
#endif #endif
//Regular double and absolute value computation: // Regular double and absolute value computation:
#ifndef REAL_H #ifndef REAL_H
#define REAL_H #define REAL_H
typedef double MilanReal; typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal; typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value) inline MilanReal MilanAbs(MilanReal value)
{ {
return fabs(value); return fabs(value);
} }
#endif #endif
//Regular float and absolute value computation: // Regular float and absolute value computation:
#ifndef FLOAT_H #ifndef FLOAT_H
#define FLOAT_H #define FLOAT_H
typedef float MilanFloat; typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat; typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value) inline MilanFloat MilanAbsFloat(MilanFloat value)
{ {
return fabs(value); return fabs(value);
} }
#endif #endif
//// Define the limits: //// Define the limits:
#ifndef LIMITS_H #ifndef LIMITS_H
#define LIMITS_H #define LIMITS_H
//Integer Maximum and Minimum: // Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX // #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN // #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX #define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN #define MilanIntMin INT32_MIN
#ifdef BIT64 #ifdef BIT64
#define MilanLongIntMax INT64_MAX #define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX #define MilanLongIntMin -INT64_MAX
#else #else
#define MilanLongIntMax INT32_MAX #define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX #define MilanLongIntMin -INT32_MAX
#endif #endif
#endif #endif
// +INFINITY // +INFINITY
const double PLUS_INFINITY = numeric_limits<int>::infinity(); const double PLUS_INFINITY = numeric_limits<int>::infinity();
const double MINUS_INFINITY = -PLUS_INFINITY; const double MINUS_INFINITY = -PLUS_INFINITY;
//#define MilanRealMax LDBL_MAX //#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY #define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY #define MilanRealMin MINUS_INFINITY
#endif #endif
//Function of find the owner of a ghost vertex using binary search: // Function of find the owner of a ghost vertex using binary search:
inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs); MilanInt myRank, MilanInt numProcs);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
( MilanLongInt adj2,
MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *verLocInd,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, MilanReal *edgeLocWeight);
MilanLongInt* verDistance,
MilanLongInt* Mate, void queuesTransfer(vector<MilanLongInt> &U,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm, vector<MilanLongInt> &privateU,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, vector<MilanLongInt> &QLocalVtx,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, vector<MilanLongInt> &QGhostVtx,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC vector<MilanLongInt> &privateQLocalVtx,
( vector<MilanLongInt> &privateQGhostVtx,
MilanLongInt NLVer, MilanLongInt NLEdge, vector<MilanLongInt> &privateQMsgType,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, vector<MilanInt> &privateQOwner);
MilanLongInt* verDistance,
MilanLongInt* Mate, bool isAlreadyMatched(MilanLongInt node,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm, MilanLongInt StartIndex,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, MilanLongInt EndIndex,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, vector<MilanLongInt> &GMate,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt* verDistance, MilanLongInt adj2,
MilanLongInt* Mate, MilanReal *edgeLocWeight,
MilanInt myRank, MilanInt numProcs, MilanInt icomm, MilanLongInt k,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, MilanLongInt *verLocInd,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt StartIndex,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *Mate,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
MilanLongInt* verDistance,
MilanLongInt* Mate, void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanInt myRank, MilanInt numProcs, MilanInt icomm, MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, MilanLongInt *numGhostEdgesPtr,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt *numGhostVerticesPtr,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent);
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate);
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *SPtr);
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message);
void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
MilanInt *BufferSizePtr,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActualPtr,
MilanLongInt *MessageIndexPtr,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus);
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *msgActualPtr,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *SPtr,
vector<MilanLongInt> &U);
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

@ -72,12 +72,6 @@
#ifdef SERIAL_MPI #ifdef SERIAL_MPI
#else #else
//MPI type map
template<typename T> MPI_Datatype TypeMap();
template<> inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template<> inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template<> inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template<> inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
// DOUBLE PRECISION VERSION // DOUBLE PRECISION VERSION
//WARNING: The vertex block on a given rank is contiguous //WARNING: The vertex block on a given rank is contiguous

@ -0,0 +1,554 @@
#include "MatchBoxPC.h"
// ***********************************************************************
//
// MatchboxP: A C++ library for approximate weighted matching
// Mahantesh Halappanavar (hala@pnnl.gov)
// Pacific Northwest National Laboratory
//
// ***********************************************************************
//
// Copyright (2021) Battelle Memorial Institute
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// DOMINATING EDGES MODEL ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate()
*
* Date : New update: Feb 17, 2019, Richland, Washington.
* Date : Original development: May 17, 2009, E&CS Bldg.
*
* Purpose : Compute Approximate Maximum Weight Matching in Linear Time
*
* Args : inputMatrix - instance of Compressed-Col format of Matrix
* Mate - The Mate array
*
* Returns : By Value: (void)
* By Reference: Mate
*
* Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge.
* Assumption: The Mate Array is empty.
*/
/*
NLVer = #of vertices, NLEdge = #of edges
CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers)
verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process
Mate = A vector of size |V_p| (local subgraph) to store the output (matching)
MPI: myRank, numProcs, comm,
Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world
Statistics: ph0_time, ph1_time, ph2_time: Runtimes
Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2)
*/
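/*
 * Illustrative example (not part of the library): for a path graph with 4 vertices
 * split across 2 ranks (2 vertices each), the arguments above could look like this
 * on rank 0, assuming 0-based global numbering and that NLEdge counts the local
 * adjacency entries (both assumptions of this sketch, not statements about the API):
 *
 *   verDistance   = { 0, 2, 4 }        // rank 0 owns vertices 0..1, rank 1 owns 2..3
 *   NLVer         = 2
 *   verLocPtr     = { 0, 1, 3 }        // CSR row pointers, NLVer+1 entries
 *   verLocInd     = { 1,   0, 2 }      // neighbours of vertex 0: {1}; of vertex 1: {0, 2}
 *   edgeLocWeight = { 1.0, 1.0, 2.0 }  // positive weights, one per adjacency entry
 *   NLEdge        = 3
 *   Mate          = { -1, -1 }         // output: one entry per local vertex
 *
 * Vertex 2 is a ghost vertex for rank 0; its owner is found via findOwnerOfGhost()
 * using verDistance.
 */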
//#define DEBUG_HANG_
#ifdef SERIAL_MPI
#else
// DOUBLE PRECISION VERSION
// WARNING: The vertex block on a given rank is contiguous
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent,
MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card)
{
/*
* verDistance: a vector of length numProcs+1;
* verDistance[i] is the first vertex index owned by the i-th process,
* verDistance[i + 1] - 1 is the last vertex index owned by the i-th process.
* NLVer: number of local vertices (verLocPtr has NLVer+1 entries)
* NLEdge: number of edges assigned to the current process
*
* The portion of the matrix assigned to the process is stored in
* Yale (CSR) notation:
* verLocInd: column indices of the nonzero entries
* verLocPtr: the i-th value is the position of the first element of the i-th row,
* and the (i+1)-th value is the position of the first element of the (i+1)-th row
*/
#if !defined(SERIAL_MPI)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Within algoEdgeApproxDominatingEdgesLinearSearchMessageBundling()";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," << verDistance[i+1];
cout << "]\n";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0) {
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," ;
cout << verDistance[numProcs]<< "]\n";
}
fflush(stdout);
#endif
MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank
MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank
MPI_Status computeStatus;
MilanLongInt msgActual = 0, msgInd = 0;
MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight
MilanReal startTime, finishTime;
startTime = MPI_Wtime();
// Data structures for sending and receiving messages:
vector<MilanLongInt> Message; // [ u, v, message_type ]
Message.resize(3, -1);
// Data structures for Message Bundling:
// Although up to two messages can be sent along any cross edge,
// only one message will be sent in the initialization phase -
// one of: REQUEST/FAILURE/SUCCESS
vector<MilanLongInt> QLocalVtx, QGhostVtx, QMsgType;
vector<MilanInt> QOwner; // Changed by Fabio to be an integer: addresses need to be integers!
MilanLongInt *PCounter = new MilanLongInt[numProcs];
for (int i = 0; i < numProcs; i++)
PCounter[i] = 0;
MilanLongInt NumMessagesBundled = 0;
// TODO: when the last computational section is refactored, this could be eliminated
MilanInt ghostOwner = 0; // Changed by Fabio to be an integer: addresses need to be integers!
MilanLongInt *candidateMate = nullptr;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NV: " << NLVer << " Edges: " << NLEdge;
fflush(stdout);
cout << "\n(" << myRank << ")StartIndex: " << StartIndex << " EndIndex: " << EndIndex;
fflush(stdout);
#endif
// Other Variables:
MilanLongInt u = -1, v = -1, w = -1, i = 0;
MilanLongInt k = -1, adj1 = -1, adj2 = -1;
MilanLongInt k1 = -1, adj11 = -1, adj12 = -1;
MilanLongInt myCard = 0;
// Build the Ghost Vertex Set: Vg
map<MilanLongInt, MilanLongInt> Ghost2LocalMap; // Map each ghost vertex to a local vertex
vector<MilanLongInt> Counter; // Store the edge count for each ghost vertex
MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of ghost vertices and ghost edges
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
// Define Adjacency Lists for Ghost Vertices:
// cout<<"Building Ghost data structures ... \n\n";
vector<MilanLongInt> verGhostPtr, verGhostInd, tempCounter;
// Mate array for ghost vertices:
vector<MilanLongInt> GMate; // Proportional to the number of ghost vertices
MilanLongInt S;
MilanLongInt privateMyCard = 0;
vector<MilanLongInt> PCumulative, PMessageBundle, PSizeInfoMessages;
vector<MPI_Request> SRequest; // Requests that are used for each send message
vector<MPI_Status> SStatus; // Status of sent messages, used in MPI_Wait
MilanLongInt MessageIndex = 0; // Pointer for current message
MilanInt BufferSize;
MilanLongInt *Buffer;
vector<MilanLongInt> privateQLocalVtx, privateQGhostVtx, privateQMsgType;
vector<MilanInt> privateQOwner;
vector<MilanLongInt> U, privateU;
initialize(NLVer, NLEdge, StartIndex,
EndIndex, &numGhostEdges,
&numGhostVertices, &S,
verLocInd, verLocPtr,
Ghost2LocalMap, Counter,
verGhostPtr, verGhostInd,
tempCounter, GMate,
Message, QLocalVtx,
QGhostVtx, QMsgType, QOwner,
candidateMate, U,
privateU,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
finishTime = MPI_Wtime();
*ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization
#ifdef DEBUG_HANG_
cout << myRank << " Finished initialization" << endl;
fflush(stdout);
#endif
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// INITIALIZATION /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Compute the Initial Matching Set:
/*
* OMP: PARALLEL_COMPUTE_CANDIDATE_MATE_B has been split off from
* PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize
* the two.
* PARALLEL_COMPUTE_CANDIDATE_MATE_B is now fully parallel.
*/
PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer,
verLocPtr,
verLocInd,
myRank,
edgeLocWeight,
candidateMate);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " candidateMate after parallelCompute " <<endl;
for (int i=0; i<NLVer; i++) {
cout << candidateMate[i] << " " ;
}
cout << endl;
#endif
#endif
/*
* PARALLEL_PROCESS_EXPOSED_VERTEX_B
* TODO: write comment
*
* TODO: Test when it's actually more efficient to execute this code
* in parallel.
*/
PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer,
candidateMate,
verLocInd,
verLocPtr,
StartIndex,
EndIndex,
Mate,
GMate,
Ghost2LocalMap,
edgeLocWeight,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
U,
privateU,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
tempCounter.clear(); // Do not need this any more
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Exposed Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
// TODO: what is the optimal UCHUNK size?
vector<MilanLongInt> UChunkBeingProcessed;
UChunkBeingProcessed.reserve(UCHUNK);
processMatchedVertices(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Process Vertices" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Matched Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
sendBundledMessages(&numGhostEdges,
&BufferSize,
Buffer,
PCumulative,
PMessageBundle,
PSizeInfoMessages,
PCounter,
NumMessagesBundled,
&msgActual,
&MessageIndex,
numProcs,
myRank,
comm,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
SRequest,
SStatus);
///////////////////////// END OF SEND BUNDLED MESSAGES //////////////////////////////////
finishTime = MPI_Wtime();
*ph1_time = finishTime - startTime; // Time taken for Phase-1
#ifdef DEBUG_HANG_
cout << myRank << " Finished sendBundles" << endl;
fflush(stdout);
#endif
*ph1_card = myCard; // Cardinality at the end of Phase-1
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// MAIN LOOP //////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Main While Loop:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Entering While(true) loop..";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
while (true) {
#ifdef DEBUG_HANG_
//if (myRank == 0)
cout << "\n(" << myRank << ") Main loop" << endl;
fflush(stdout);
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMatchedVerticesAndSendMessages(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner,
comm,
&msgActual,
Message);
///////////////////////// END OF PROCESS MATCHED VERTICES /////////////////////////
//// BREAK IF NO MESSAGES EXPECTED /////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMatchedAndSend phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Deciding whether to break: S= " << S << endl;
#endif
if (S == 0) {
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Breaking out" << endl;
fflush(stdout);
#endif
break;
}
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MESSAGES //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMessages(NLVer,
Mate,
candidateMate,
Ghost2LocalMap,
GMate,
Counter,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&msgActual,
edgeLocWeight,
verDistance,
verLocPtr,
k,
verLocInd,
numProcs,
myRank,
comm,
Message,
numGhostEdges,
u,
v,
&S,
U);
///////////////////////// END OF PROCESS MESSAGES /////////////////////////////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMessages phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl;
fflush(stdout);
#endif
} // End of while (true)
clean(NLVer,
myRank,
MessageIndex,
SRequest,
SStatus,
BufferSize,
Buffer,
msgActual,
msgActualSent,
msgInd,
msgIndSent,
NumMessagesBundled,
msgPercent);
finishTime = MPI_Wtime();
*ph2_time = finishTime - startTime; // Time taken for Phase-2
*ph2_card = myCard; // Cardinality at the end of Phase-2
}
// End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate
#endif
#endif

@ -97,6 +97,8 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_c_dec_aggregator_tprol' name='amg_c_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
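The rewritten filter in this hunk replaces the element-by-element compaction with a single backward scan: after coo_restr%fix() the COO entries are sorted by row index, so the rows to keep (row index <= nrl, i.e. the locally owned rows) form a prefix of the entry list, and it is enough to locate the last entry of that prefix and shrink the nonzero count. A hedged C++ sketch of the same idea on a generic, row-sorted COO triple (illustration only):

#include <cstdint>
#include <vector>

struct Coo {                       // minimal COO container, assumed sorted by row index
    std::vector<int64_t> ia, ja;
    std::vector<double>  val;
};

// Keep only the entries whose row index is <= nrl, assuming ia is already
// sorted in non-decreasing order (as after coo_restr%fix() in the hunk above).
void keep_local_rows(Coo& c, int64_t nrl)
{
    std::size_t nz = c.ia.size();
    while (nz > 0 && c.ia[nz - 1] > nrl)   // backward scan: drop the tail of non-local rows
        --nz;
    c.ia.resize(nz);
    c.ja.resize(nz);
    c.val.resize(nz);
}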

@ -72,7 +72,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_c_inner_mod use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_spk_), allocatable :: val(:), diag(:) complex(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. the total number of aggregates NAGGR;
! 2. how much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
! (An illustrative sketch of this encoding/renumbering appears after the
! end of this OpenMP block.)
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return
@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_c_inner_mod use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbour; mark strong links with +1, weak links with -1 ! Compute the 1-neighbour; mark strong links with +1, weak links with -1
! !
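!
! A minimal, standalone sketch (illustrative only; demo_soc2_mark is an assumed
! name, and plain double precision is used instead of the library kinds) of the
! scaled measure computed above and of the +1/-1 marking stored in s_neigh:
! mu_ij = |a_ij| / sqrt(|a_ii * a_jj|), strong when mu_ij >= theta.
!
program demo_soc2_mark
  implicit none
  real(kind(1.d0)) :: aij, aii, ajj, theta, mu, mark
  aij = -0.5d0; aii = 4.0d0; ajj = 4.0d0; theta = 0.1d0
  mu = abs(aij)/sqrt(abs(aii*ajj))   ! 0.5/4 = 0.125
  if (mu >= theta) then
    mark =  1.0d0                    ! strong link
  else
    mark = -1.0d0                    ! weak link
  end if
  write(*,*) 'mu =', mu, ' mark =', mark
end program demo_soc2_mark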
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone s_neigh_coo%val(k) = sone
else else
s_neigh_coo%val(ip) = -sone s_neigh_coo%val(k) = -sone
end if end if
else
s_neigh_coo%val(k) = -sone
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = czero tmp = czero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
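!
! A standalone sketch (illustrative only; the 0.05 drop threshold and the name
! demo_filter_row are assumptions, not the criterion used by this routine) of the
! diagonal correction visible above: the entries dropped from row I are summed in
! TMP and then subtracted from the diagonal entry of the filtered matrix Af.
!
program demo_filter_row
  implicit none
  integer, parameter :: nz = 4
  complex(kind(1.d0)) :: val(nz), tmp
  integer :: ja(nz), jd, j, i
  i   = 2                               ! row index
  ja  = (/ 1, 2, 3, 4 /)
  val = (/ (0.01d0,0.d0), (4.d0,0.d0), (-1.d0,0.d0), (0.02d0,0.d0) /)
  tmp = (0.d0,0.d0)
  jd  = -1
  do j=1, nz
    if (ja(j) == i) jd = j              ! remember the diagonal position
    if (abs(val(j)) < 0.05d0) then      ! assumed drop criterion, for illustration
      tmp    = tmp + val(j)
      val(j) = (0.d0,0.d0)
    end if
  end do
  if (jd > 0) val(jd) = val(jd) - tmp   ! move the dropped mass onto the diagonal
  write(*,*) val
end program demo_filter_row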
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= czero) then if (adiag(i) /= czero) then
adiag(i) = cone / adiag(i) adiag(i) = cone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = cone adiag(i) = cone
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
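!
! A dense, standalone sketch (illustrative only; demo_smooth_prol and the tiny
! matrices are assumptions) of the smoothing step named above: the tentative
! prolongator Prol is multiplied by (I - omega*D^{-1}*Af), with D = diag(A).
!
program demo_smooth_prol
  implicit none
  integer, parameter :: n = 3, nagg = 2
  real(kind(1.d0)) :: af(n,n), dinv(n), prol(n,nagg), smoothed(n,nagg), omega
  integer :: i
  af   = reshape( (/ 4.d0,-1.d0, 0.d0,  -1.d0, 4.d0,-1.d0,  0.d0,-1.d0, 4.d0 /), (/ n, n /) )
  prol = reshape( (/ 1.d0, 1.d0, 0.d0,   0.d0, 0.d0, 1.d0 /), (/ n, nagg /) )
  do i=1,n
    dinv(i) = 1.d0/af(i,i)
  end do
  omega = 2.d0/3.d0
  do i=1,n          ! smoothed = (I - omega*D^{-1}*Af) * Prol, one row at a time
    smoothed(i,:) = prol(i,:) - omega*dinv(i)*matmul(af(i,:),prol)
  end do
  write(*,*) smoothed
end program demo_smooth_prol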
@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '
@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_d_dec_aggregator_tprol' name='amg_d_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')
@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
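!
! A standalone sketch (illustrative only; demo_keep_local_rows is an assumed name)
! of the truncation above: after coo_restr%fix() the entries are in row-major
! order, so every row beyond NRL forms a trailing block, and scanning backwards
! for the last entry with ia(k) <= nrl and truncating there keeps exactly the
! local rows, without the element-by-element compaction of the old code.
!
program demo_keep_local_rows
  implicit none
  integer, parameter :: nz = 6, nrl = 3
  integer :: ia(nz) = (/ 1, 1, 2, 3, 4, 5 /)   ! row indices, already sorted
  integer :: k, new_nz
  new_nz = 0
  search: do k = nz, 1, -1
    if (ia(k) <= nrl) then
      new_nz = k                               ! analogue of coo_restr%set_nzeros(k)
      exit search
    end if
  end do search
  write(*,*) 'kept nonzeros:', new_nz          ! 4: entries in rows 4 and 5 dropped
end program demo_keep_local_rows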
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_d_inner_mod use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_dpk_), allocatable :: val(:), diag(:) real(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return
@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_d_inner_mod use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbour; mark strong links with +1, weak links with -1 ! Compute the 1-neighbour; mark strong links with +1, weak links with -1
! !
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done s_neigh_coo%val(k) = done
else else
s_neigh_coo%val(ip) = -done s_neigh_coo%val(k) = -done
end if end if
else
s_neigh_coo%val(k) = -done
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = dzero tmp = dzero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= dzero) then if (adiag(i) /= dzero) then
adiag(i) = done / adiag(i) adiag(i) = done / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = done adiag(i) = done
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_s_dec_aggregator_tprol' name='amg_s_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
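! (Hedged editorial note: this backward search relies on the preceding fix()
!  call having left the COO entries sorted by row index, so that all entries
!  with row greater than NRL sit at the tail of the arrays and a single
!  cutoff via set_nzeros is enough to keep only the local rows.)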
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999

@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_s_inner_mod use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_spk_), allocatable :: val(:), diag(:) real(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
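! Hypothetical worked example of the encoding used below (numbers are
! illustrative only, not taken from the patch): with NTHS=4 threads,
! suppose thread KK=2 owns rows BNDS(2)=11 .. BNDS(3)-1 and creates its
! third local aggregate, so LOCNAGGR(2)=3. The provisional label is
!   ITMP = (BNDS(2)-1+LOCNAGGR(2))*NTHS+KK = (10+3)*4+2 = 54.
! The renumbering loop after the prefix sum recovers the owning thread as
! KP = MOD(54,NTHS) = 2 and the local index as 54/NTHS - (BNDS(2)-1) = 3,
! then shifts it by LOCNAGGR(KP), the number of aggregates created by
! threads 0..KP-1, to obtain the final global aggregate number.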
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
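! ITMP has wrapped around: the encoded label no longer fits the
! integer kind, so record the sentinel error value that is checked
! after the parallel region ('Overflow in encoding ILAGGR').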
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return

@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_s_inner_mod use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1 ! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1
! !
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone s_neigh_coo%val(k) = sone
else else
s_neigh_coo%val(ip) = -sone s_neigh_coo%val(k) = -sone
end if end if
else
s_neigh_coo%val(k) = -sone
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
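! Hypothetical worked example of the encoding used below (numbers are
! illustrative only, not taken from the patch): with NTHS=4 threads,
! suppose thread KK=2 owns rows BNDS(2)=11 .. BNDS(3)-1 and creates its
! third local aggregate, so LOCNAGGR(2)=3. The provisional label is
!   ITMP = (BNDS(2)-1+LOCNAGGR(2))*NTHS+KK = (10+3)*4+2 = 54.
! The renumbering loop after the prefix sum recovers the owning thread as
! KP = MOD(54,NTHS) = 2 and the local index as 54/NTHS - (BNDS(2)-1) = 3,
! then shifts it by LOCNAGGR(KP), the number of aggregates created by
! threads 0..KP-1, to obtain the final global aggregate number.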
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
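! ITMP has wrapped around: the encoded label no longer fits the
! integer kind, so record the sentinel error value that is checked
! after the parallel region ('Overflow in encoding ILAGGR').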
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = szero tmp = szero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= szero) then if (adiag(i) /= szero) then
adiag(i) = sone / adiag(i) adiag(i) = sone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = sone adiag(i) = sone
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_z_dec_aggregator_tprol' name='amg_z_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
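! (Hedged editorial note: this backward search relies on the preceding fix()
!  call having left the COO entries sorted by row index, so that all entries
!  with row greater than NRL sit at the tail of the arrays and a single
!  cutoff via set_nzeros is enough to keep only the local rows.)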
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999

@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_z_inner_mod use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_dpk_), allocatable :: val(:), diag(:) complex(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
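! Hypothetical worked example of the encoding used below (numbers are
! illustrative only, not taken from the patch): with NTHS=4 threads,
! suppose thread KK=2 owns rows BNDS(2)=11 .. BNDS(3)-1 and creates its
! third local aggregate, so LOCNAGGR(2)=3. The provisional label is
!   ITMP = (BNDS(2)-1+LOCNAGGR(2))*NTHS+KK = (10+3)*4+2 = 54.
! The renumbering loop after the prefix sum recovers the owning thread as
! KP = MOD(54,NTHS) = 2 and the local index as 54/NTHS - (BNDS(2)-1) = 3,
! then shifts it by LOCNAGGR(KP), the number of aggregates created by
! threads 0..KP-1, to obtain the final global aggregate number.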
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
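! ITMP has wrapped around: the encoded label no longer fits the
! integer kind, so record the sentinel error value that is checked
! after the parallel region ('Overflow in encoding ILAGGR').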
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return

@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_z_inner_mod use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1 ! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1
! !
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done s_neigh_coo%val(k) = done
else else
s_neigh_coo%val(ip) = -done s_neigh_coo%val(k) = -done
end if end if
else
s_neigh_coo%val(k) = -done
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is
! applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
! (A standalone sketch of this numbering scheme follows the OpenMP block below.)
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighbourhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if nzcnt==0 (in which case the neighbourhood only
! contains I even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
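The per-thread counting and renumbering described in the comments at the top of the OpenMP block above can be illustrated with a small standalone C++/OpenMP sketch. It uses made-up data (20 items, every even item starts an aggregate), omits the BNDS offsets, and is only a simplified illustration, not the library code.

// Simplified standalone sketch of the per-thread aggregate numbering: each
// thread counts its own aggregates, labels them with a temporary
// (position, owning thread) encoding, and an exclusive prefix sum over the
// per-thread counts turns that encoding into one global numbering.
#include <omp.h>
#include <cstdio>
#include <vector>

int main()
{
  const int n = 20;
  std::vector<int> label(n, -1);            // -1 = not aggregated yet
  std::vector<long long> locnaggr;          // per-thread aggregate counters
  int nths = 0;

  #pragma omp parallel
  {
    #pragma omp master
    {
      nths = omp_get_num_threads();
      locnaggr.assign(nths + 1, 0);
    }
    #pragma omp barrier
    int myth = omp_get_thread_num();

    #pragma omp for schedule(static)
    for (int i = 0; i < n; i++) {
      if (i % 2 == 0) {                     // pretend this item starts a new aggregate
        long long pos = ++locnaggr[myth];   // 1-based position within this thread
        label[i] = (int)(pos * nths + myth);// temporary encoding: (position, owning thread)
      }
    }                                       // implicit barrier of the omp for

    #pragma omp master
    {
      // Exclusive prefix sum: locnaggr[t] becomes the shift for thread t
      for (int t = nths; t >= 1; t--) locnaggr[t] = locnaggr[t - 1];
      locnaggr[0] = 0;
      for (int t = 1; t <= nths; t++) locnaggr[t] += locnaggr[t - 1];
    }
    #pragma omp barrier

    #pragma omp for schedule(static)
    for (int i = 0; i < n; i++) {
      if (label[i] > 0) {                   // decode: global index = position + shift of owner
        int kp = label[i] % nths;
        label[i] = (int)(label[i] / nths + locnaggr[kp]);
      }
    }
  }
  for (int i = 0; i < n; i++) printf("%d ", label[i]);
  printf("\n");
  return 0;
}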
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = zzero tmp = zzero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= zzero) then if (adiag(i) /= zzero) then
adiag(i) = zone / adiag(i) adiag(i) = zone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = zone adiag(i) = zone
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '

@ -0,0 +1,91 @@
#include "MatchBoxPC.h"
// TODO comment
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent)
{
// Cleanup Phase
#pragma omp parallel
{
#pragma omp master
{
#pragma omp task
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Waitall= " << endl;
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Waitall " << endl;
fflush(stdout);
#endif
//return;
MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]);
// MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer
if (BufferSize > 0)
{
MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer
free(Buffer); // Free the memory that was allocated
}
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")End of function to compute matching: " << endl;
fflush(stdout);
cout << "\n(" << myRank << ")myCardinality: " << myCard << endl;
fflush(stdout);
cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl;
fflush(stdout);
cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges;
cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2;
cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled;
cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd;
if (msgInd > 0)
{
cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
#pragma omp task
{
*msgActualSent = msgActual;
*msgIndSent = msgInd;
if (msgInd > 0)
{
*msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0;
}
else
{
*msgPercent = 0;
}
}
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Done" << endl;
fflush(stdout);
#endif
}
}
}
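clean() finishes the buffered-send protocol set up elsewhere in the matching code: a buffer is attached before any MPI_Bsend and detached (which waits for pending sends) once the computation is over. A minimal hedged sketch of that lifecycle follows; the tag value and message contents are hypothetical, and MPI is assumed to be initialized elsewhere.

#include <mpi.h>
#include <cstdlib>

// Minimal sketch of the attach / Bsend / detach lifecycle (not the library code).
void bsend_lifecycle_sketch(MPI_Comm comm, int peer)
{
  int bufferSize = 0;
  MPI_Pack_size(3, MPI_LONG_LONG, comm, &bufferSize);
  bufferSize += MPI_BSEND_OVERHEAD;
  void *buffer = malloc(bufferSize);
  MPI_Buffer_attach(buffer, bufferSize);        // attach once, before any MPI_Bsend

  long long message[3] = {0, 1, 2};             // e.g. {local vertex, ghost vertex, type}
  MPI_Bsend(message, 3, MPI_LONG_LONG, peer, /*tag=*/7, comm);

  // Detaching waits until buffered messages have been delivered; only then
  // is it safe to free the buffer, exactly as clean() does above.
  MPI_Buffer_detach(&buffer, &bufferSize);
  free(buffer);
}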

@ -0,0 +1,73 @@
#include "MatchBoxPC.h"
/**
* Search for the candidate mate without checking whether the vertices are already matched.
* Returns the index of the edge with the heaviest weight.
* @param adj1
* @param adj2
* @param verLocInd
* @param edgeLocWeight
* @return
*/
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight)
{
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
int finalK = -1; // -1 if the adjacency range is empty
for (int k = adj1; k < adj2; k++) {
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
finalK = k;
}
} // End of for loop
return finalK;
}
/**
* //TODO documentation
* @param adj1
* @param adj2
* @param edgeLocWeight
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
for (k = adj1; k < adj2; k++) {
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap))
continue;
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
}
} // End of for loop
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
return w;
}
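Both routines apply the same selection rule: among the neighbours (restricted to unmatched ones in computeCandidateMate), take the edge with the largest weight and break ties in favour of the neighbour with the larger index. A tiny standalone illustration of the rule on made-up CSR-style data (names are hypothetical):

#include <cstdio>

// Pick the heaviest neighbour; ties go to the larger vertex index.
int heaviestNeighbour(int adj1, int adj2, const int *nbrInd, const double *nbrWt)
{
  int w = -1;
  double best = -1.0e300;
  for (int k = adj1; k < adj2; k++) {
    if ((nbrWt[k] > best) || ((nbrWt[k] == best) && (w < nbrInd[k]))) {
      best = nbrWt[k];
      w = nbrInd[k];
    }
  }
  return w;                                   // -1 if the adjacency range is empty
}

int main()
{
  // One vertex with neighbours {3, 7, 5} and edge weights {2.0, 2.0, 1.5}
  int nbrInd[] = {3, 7, 5};
  double nbrWt[] = {2.0, 2.0, 1.5};
  printf("candidate mate = %d\n", heaviestNeighbour(0, 3, nbrInd, nbrWt)); // prints 7
  return 0;
}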

@ -80,9 +80,11 @@ class staticQueue
MilanLongInt squeueTail; MilanLongInt squeueTail;
MilanLongInt NumNodes; MilanLongInt NumNodes;
//FIXME I had to comment this piece of code in order to make everything work.
// why?
//Prevent Assignment and Pass by Value: //Prevent Assignment and Pass by Value:
staticQueue(const staticQueue& src); //staticQueue(const staticQueue& src);
staticQueue& operator=(const staticQueue& rhs); //staticQueue& operator=(const staticQueue& rhs);
public: public:
//Constructors and Destructors //Constructors and Destructors

@ -0,0 +1,31 @@
#include "MatchBoxPC.h"
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU)
{
UChunkBeingProcessed.clear();
#pragma omp critical(U)
{
if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U
{
while (!privateU.empty())
{
UChunkBeingProcessed.push_back(privateU.back());
privateU.pop_back();
}
}
else
{
for (int i = 0; i < UCHUNK; i++)
{ // Pop the new nodes
if (U.empty())
break;
UChunkBeingProcessed.push_back(U.back());
U.pop_back();
}
}
} // End of critical(U)
}
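A hedged usage sketch of this chunked draining pattern, simplified to a sequential toy with made-up names (the real code calls extractUChunk from inside the parallel matching loop, with privateU refilled by queuesTransfer):

#include <cstdio>
#include <vector>

static const int CHUNK = 4;                   // stand-in for UCHUNK

void extractChunk(std::vector<long> &chunk, std::vector<long> &U, std::vector<long> &priv)
{
  chunk.clear();
  if (U.empty() && !priv.empty()) {           // prefer leftovers in the private list
    while (!priv.empty()) { chunk.push_back(priv.back()); priv.pop_back(); }
  } else {
    for (int i = 0; i < CHUNK && !U.empty(); i++) { chunk.push_back(U.back()); U.pop_back(); }
  }
}

int main()
{
  std::vector<long> U = {1, 2, 3, 4, 5, 6}, priv = {7, 8}, chunk;
  while (!U.empty() || !priv.empty()) {
    extractChunk(chunk, U, priv);
    for (long u : chunk) printf("%ld ", u);   // process the chunk
  }
  printf("\n");                               // prints: 6 5 4 3 2 1 8 7
  return 0;
}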

@ -0,0 +1,29 @@
#include "MatchBoxPC.h"
/// Find the owner of a ghost node:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs)
{
MilanLongInt mStartInd = mVerDistance[myRank];
MilanInt Start = 0;
MilanInt End = numProcs;
MilanInt Current = 0;
while (Start <= End)
{
Current = (End + Start) / 2;
// CASE-1:
if (mVerDistance[Current] == vtxIndex) return Current;
else // CASE 2:
if (mVerDistance[Current] > vtxIndex)
End = Current - 1;
else // CASE 3:
Start = Current + 1;
} // End of While()
if (mVerDistance[Current] > vtxIndex)
return (Current - 1);
return Current;
} // End of findOwnerOfGhost()
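mVerDistance[r] holds the first global vertex index owned by rank r, so the owner of a ghost vertex is the last rank whose start index does not exceed it, found here by binary search. A standalone worked example on hypothetical data (0-based indices, 3 ranks with 4 vertices each):

#include <cstdio>

int ownerOf(long v, const long *verDistance, int numProcs)
{
  int lo = 0, hi = numProcs, cur = 0;
  while (lo <= hi) {
    cur = (lo + hi) / 2;
    if (verDistance[cur] == v) return cur;
    if (verDistance[cur] > v) hi = cur - 1; else lo = cur + 1;
  }
  return (verDistance[cur] > v) ? cur - 1 : cur;
}

int main()
{
  long verDistance[] = {0, 4, 8, 12};         // 3 ranks, each owning 4 vertices
  printf("%d %d %d\n", ownerOf(0, verDistance, 3),
                       ownerOf(5, verDistance, 3),
                       ownerOf(11, verDistance, 3));  // prints: 0 1 2
  return 0;
}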

@ -0,0 +1,304 @@
#include "MatchBoxPC.h"
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdges,
MilanLongInt *numGhostVertices,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt insertMe = 0;
MilanLongInt adj1, adj2;
int i, v, k, w;
// index that starts with zero to |Vg| - 1
map<MilanLongInt, MilanLongInt>::iterator storedAlready;
#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD)
{
#pragma omp single
{
#ifdef TIME_TRACKER
double Ghost2LocalInitialization = MPI_Wtime();
#endif
/*
* OMP Ghost2LocalInitialization
* This loop analyzes all the edges and, when it finds a ghost edge,
* puts it in Ghost2LocalMap.
* A critical region is needed when inserting data in the map.
*
* Despite the critical region it is still worthwhile to
* parallelize this loop, because the critical region is executed
* only when a ghost edge is found, and ghost edges are a minority,
* circa 3.5% during the tests.
*/
#pragma omp task depend(out \
: *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices)
{
#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \
: numGhostEdges[:1])
for (i = 0; i < NLEdge; i++)
{ // O(m) - Each edge stored twice
insertMe = verLocInd[i];
if ((insertMe < StartIndex) || (insertMe > EndIndex))
{ // Find a ghost
(*numGhostEdges)++;
#pragma omp critical
{
storedAlready = Ghost2LocalMap.find(insertMe);
if (storedAlready != Ghost2LocalMap.end())
{ // Has already been added
Counter[storedAlready->second]++; // Increment the counter
}
else
{ // Insert an entry for the ghost:
Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry
Counter.push_back(1); // Initialize the counter
(*numGhostVertices)++; // Increment the number of ghost vertices
} // End of else()
}
} // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
} // End of for(ghost vertices)
} // end of task depend
// *numGhostEdges = atomicNumGhostEdges;
#ifdef TIME_TRACKER
Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization;
fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges;
if (!Ghost2LocalMap.empty())
{
cout << "\n(" << myRank << ")Final Map : on process ";
cout << "\n(" << myRank << ")Key \t Value \t Counter \n";
fflush(stdout);
storedAlready = Ghost2LocalMap.begin();
do
{
cout << storedAlready->second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl;
fflush(stdout);
storedAlready++;
} while (storedAlready != Ghost2LocalMap.end());
}
#endif
#pragma omp task depend(out \
: verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \
: *numGhostVertices, *numGhostEdges)
{
// Initialize adjacency Lists for Ghost Vertices:
try
{
verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector
tempCounter.reserve(*numGhostVertices); // Pointer Vector
verGhostInd.reserve(*numGhostEdges); // Index Vector
GMate.reserve(*numGhostVertices); // Ghost Mate Vector
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
// Initialize the Vectors:
verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector
tempCounter.resize(*numGhostVertices, 0); // Temporary Counter
verGhostInd.resize(*numGhostEdges, -1); // Index Vector
GMate.resize(*numGhostVertices, -1); // Temporary Counter
verGhostPtr[0] = 0; // The first value
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Pointer: ";
fflush(stdout);
#endif
} // End of task
#pragma omp task depend(out \
: verGhostPtr) depend(in \
: Counter, *numGhostVertices)
{
#ifdef TIME_TRACKER
double verGhostPtrInitialization = MPI_Wtime();
#endif
for (i = 0; i < *numGhostVertices; i++)
{ // O(|Ghost Vertices|)
verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i];
#ifdef PRINT_DEBUG_INFO_
cout << verGhostPtr[i] << "\t";
fflush(stdout);
#endif
}
#ifdef TIME_TRACKER
verGhostPtrInitialization = MPI_Wtime() - verGhostPtrInitialization;
fprintf(stderr, "verGhostPtrInitialization time: %f\n", verGhostPtrInitialization);
#endif
} // End of task
#ifdef PRINT_DEBUG_INFO_
if (*numGhostVertices > 0)
cout << verGhostPtr[*numGhostVertices] << "\n";
fflush(stdout);
#endif
#ifdef TIME_TRACKER
double verGhostIndInitialization = MPI_Wtime();
#endif
/*
* OMP verGhostIndInitialization
*
* In this loop verGhostInd is initialized
* with the data related to ghost edges.
* The check to see whether a node is a ghost node is
* executed in parallel, and when a ghost node
* is found a critical region is entered.
*
* Despite the critical region it is still useful to
* parallelize the loop, because ghost nodes
* are a minority, hence the critical region is executed
* only a few times, circa 3.5% of the iterations in the tests.
*/
#pragma omp task depend(in \
: insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \
: verGhostInd)
{
#pragma omp taskloop num_tasks(NUM_THREAD)
for (v = 0; v < NLVer; v++)
{
adj1 = verLocPtr[v]; // Vertex Pointer
adj2 = verLocPtr[v + 1];
for (k = adj1; k < adj2; k++)
{
w = verLocInd[k]; // Get the adjacent vertex
if ((w < StartIndex) || (w > EndIndex))
{ // Find a ghost
#pragma omp critical
{
insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert
tempCounter[Ghost2LocalMap[w]]++; // Increment the counter
}
verGhostInd[insertMe] = v + StartIndex; // Add the adjacency
} // End of if((w < StartIndex) || (w > EndIndex))
} // End of for(k)
} // End of for (v)
} // end of taskloop
#ifdef TIME_TRACKER
verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization;
fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Index: ";
for (v = 0; v < *numGhostEdges; v++)
cout << verGhostInd[v] << "\t";
cout << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostEdges) depend(out \
: QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
try
{
QLocalVtx.reserve(*numGhostEdges); // Local Vertex
QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex
QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure)
QOwner.reserve(*numGhostEdges); // Owner of the ghost: compute once and use later
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
} // end of task
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Allocating CandidateMate.. ";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostVertices) depend(out \
: candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner)
{
// Allocate Data Structures:
/*
* candidateMate was a vector and has been replaced with an array;
* there is no clear benefit in using a vector here,
* so it was replaced with an array, which is slightly faster.
*/
candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)];
*S = (*numGhostVertices); // Initialize S with number of Ghost Vertices
/*
* Create the Queue Data Structure for the Dominating Set
*
* I had to declare the staticQueue U before the parallel region
* to have it in the correct scope. Since we can't change the dimension
* of a staticQueue I had to destroy the previous object and instantiate
* a new one of the correct size.
*/
//new (&U) staticQueue(NLVer + (*numGhostVertices));
U.reserve(NLVer + (*numGhostVertices));
// Initialize the private vectors
privateQLocalVtx.reserve(*numGhostVertices);
privateQGhostVtx.reserve(*numGhostVertices);
privateQMsgType.reserve(*numGhostVertices);
privateQOwner.reserve(*numGhostVertices);
privateU.reserve(*numGhostVertices);
} // end of task
} // End of single region
} // End of parallel region
}
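The ghost adjacency built above (verGhostPtr/verGhostInd) follows the usual counting-sort construction: count the cross edges per ghost, prefix-sum the counts into pointers, then scatter the local endpoints using a temporary counter. A standalone sketch on made-up data (names are hypothetical):

#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // Each entry is (localVertex, ghostId) for one cross edge; 3 ghosts with ids 0..2.
  std::vector<std::pair<int,int>> crossEdges = {{10,1},{11,0},{12,1},{13,2},{14,1}};
  int numGhosts = 3;
  std::vector<int> counter(numGhosts, 0), ptr(numGhosts + 1, 0), ind(crossEdges.size(), -1);
  for (auto &e : crossEdges) counter[e.second]++;                       // count per ghost
  for (int g = 0; g < numGhosts; g++) ptr[g + 1] = ptr[g] + counter[g]; // prefix sum
  std::vector<int> tmp(numGhosts, 0);
  for (auto &e : crossEdges)                                            // scatter phase
    ind[ptr[e.second] + tmp[e.second]++] = e.first;
  for (int g = 0; g < numGhosts; g++) {
    printf("ghost %d:", g);
    for (int k = ptr[g]; k < ptr[g + 1]; k++) printf(" %d", ind[k]);
    printf("\n");                             // ghost 0: 11 / ghost 1: 10 12 14 / ghost 2: 13
  }
  return 0;
}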

@ -0,0 +1,46 @@
#include "MatchBoxPC.h"
/**
* //TODO documentation
* @param node
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
/*
#pragma omp critical(Mate)
{
if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex?
result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched
} else { //A local vertex
result = (Mate[node - StartIndex] >= 0); // Already matched
}
}
*/
MilanLongInt val;
if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex
{
#pragma omp atomic read
val = GMate[Ghost2LocalMap[node]];
return val >= 0; // Already matched
}
// If not ghost vertex
#pragma omp atomic read
val = Mate[node - StartIndex];
return val >= 0; // Already matched
}

@ -0,0 +1,27 @@
#include "MatchBoxPC.h"
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate)
{
MilanLongInt v = -1;
#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD)
{
#pragma omp for schedule(static)
for (v = 0; v < NLVer; v++) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight);
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
}
}
}

@ -0,0 +1,24 @@
#include "MatchBoxPC.h"
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *S)
{
// Start: PARALLEL_PROCESS_CROSS_EDGE_B
MilanLongInt captureCounter;
#pragma omp atomic capture
captureCounter = --(*edge); // Decrement
//assert(captureCounter >= 0);
if (captureCounter == 0)
#pragma omp atomic
(*S)--; // Decrement S
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages";
fflush(stdout);
#endif
// End: PARALLEL_PROCESS_CROSS_EDGE_B
}
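The atomic capture above lets many threads decrement the per-ghost counter while guaranteeing that exactly one of them observes the transition to zero and decrements S. A minimal standalone sketch of the pattern (hypothetical counts, not the library code):

#include <omp.h>
#include <cstdio>

int main()
{
  long counter = 8;                  // e.g. messages still expected for one ghost vertex
  long S = 3;                        // e.g. number of ghost vertices still pending
  #pragma omp parallel for
  for (int i = 0; i < 8; i++) {
    long captured;
    #pragma omp atomic capture
    captured = --counter;            // decrement and read the new value atomically
    if (captured == 0) {             // only one thread observes exactly zero
      #pragma omp atomic
      S--;
    }
  }
  printf("counter=%ld S=%ld\n", counter, S);   // prints: counter=0 S=2
  return 0;
}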

@ -0,0 +1,195 @@
#include "MatchBoxPC.h"
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *S,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0;
MilanInt ghostOwner = 0, option, igw;
#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \
default(shared) num_threads(NUM_THREAD)
{
#pragma omp for reduction(+ \
: PCounter[:numProcs], myCard \
[:1], msgInd \
[:1], NumMessagesBundled \
[:1]) \
schedule(static)
for (v = 0; v < NLVer; v++) {
option = -1;
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
k = candidateMate[v];
candidateMate[v] = verLocInd[k];
w = candidateMate[v];
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
#pragma omp critical(processExposed)
{
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) {
w = computeCandidateMate(verLocPtr[v],
verLocPtr[v + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v] = w;
}
if (w >= 0) {
(*myCard)++;
if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
option = 1;
Mate[v] = w;
GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == (v + StartIndex)) {
option = 3;
Mate[v] = w; // v is local
Mate[w - StartIndex] = v + StartIndex; // w is local
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") ";
fflush(stdout);
#endif
} // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) )
} // End of Else
} // End of second if
} // End critical processExposed
} // End of if(w >=0)
else {
// This piece of code is executed only a very small number of times
adj11 = verLocPtr[v];
adj12 = verLocPtr[v + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
}
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
switch (option)
{
case -1:
break;
case 1:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
case 2:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message (291):";
cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
default:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
break;
}
} // End of for ( v=0; v < NLVer; v++ )
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of parallel region
}

@ -0,0 +1,294 @@
#include "MatchBoxPC.h"
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \
privateQMsgType, privateQOwner, UChunkBeingProcessed) \
default(shared) num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// No things to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
(*NumMessagesBundled)++;
PCounter[ghostOwner]++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
}
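The private queues (privateU, privateQLocalVtx, ...) and the merges under named critical sections follow a standard idiom: stage pushes in thread-private vectors and pay for synchronization only once per thread (or per chunk) when merging into the shared vectors. A toy sketch of the idiom with made-up names:

#include <omp.h>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> shared;
  #pragma omp parallel
  {
    std::vector<int> priv;                        // thread-private staging buffer
    #pragma omp for
    for (int i = 0; i < 100; i++)
      if (i % 7 == 0) priv.push_back(i);          // cheap, contention-free pushes
    #pragma omp critical(sendMessageTransfer)     // one merge per thread
    shared.insert(shared.end(), priv.begin(), priv.end());
  }
  printf("collected %zu items\n", shared.size()); // 15 items, order not deterministic
  return 0;
}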

@ -0,0 +1,308 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message)
{
MilanLongInt initialSize = QLocalVtx.size();
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\
privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1], msgActual \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// No things to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = u; // LOCAL
// Message[1] = v; // GHOST
// Message[2] = SUCCESS; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
// Send the messages
#ifdef DEBUG_HANG_
cout << myRank<<" Sending: "<<QOwner.size()-initialSize<<" messages" <<endl;
#endif
for (int i = initialSize; i < QOwner.size(); i++) {
Message[0] = QLocalVtx[i];
Message[1] = QGhostVtx[i];
Message[2] = QMsgType[i];
ghostOwner = QOwner[i];
//MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
}
#ifdef DEBUG_HANG_
cout << myRank<<" Done sending messages"<<endl;
#endif
}

@ -0,0 +1,315 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *msgActual,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *S,
vector<MilanLongInt> &U)
{
//#define PRINT_DEBUG_INFO_
MilanInt Sender;
MPI_Status computeStatus;
MilanLongInt bundleSize, w;
MilanLongInt adj11, adj12, k1;
MilanLongInt ghostOwner;
int error_codeC;
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
char error_message[MPI_MAX_ERROR_STRING];
int message_length;
MilanLongInt message_type = 0;
// Buffer to receive bundled messages
// Maximum messages that can be received from any processor is
// twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS)
vector<MilanLongInt> ReceiveBuffer;
try
{
ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
#ifdef PRINT_DEBUG_INFO_
cout
<< "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to begin Message processing phase ... *S=" << *S << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
// BLOCKING RECEIVE:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << " Waiting for blocking receive..." << endl;
fflush(stdout);
fflush(stdout);
#endif
//cout << myRank<<" Receiving ...";
error_codeC = MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS)
{
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n";
fflush(stdout);
}
Sender = computeStatus.MPI_SOURCE;
//cout << " ...from "<<Sender << endl;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl;
fflush(stdout);
#endif
if (Message[2] == SIZEINFO) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl;
fflush(stdout);
#endif
bundleSize = Message[0]; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle Before: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
// Receive the message
//cout << myRank<<" Receiving from "<<Sender<<endl;
error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(), Sender, BundleTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS) {
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n";
fflush(stdout);
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle After: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
} else { // Just a single message:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl;
fflush(stdout);
#endif
// Add the current message to Queue:
bundleSize = 3; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
ReceiveBuffer[0] = Message[0]; // u
ReceiveBuffer[1] = Message[1]; // v
ReceiveBuffer[2] = Message[2]; // message_type
}
#ifdef DEBUG_GHOST_
if ((v < StartIndex) || (v > EndIndex)) {
cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl;
fflush(stdout);
}
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl;
fflush(stdout);
#endif
// Most of the time bundleSize == 3, so it is not worth parallelizing this loop
for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) {
u = ReceiveBuffer[bundleCounter - 3]; // GHOST
v = ReceiveBuffer[bundleCounter - 2]; // LOCAL
message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE
// CASE I: REQUEST
if (message_type == REQUEST) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is REQUEST" << endl;
fflush(stdout);
#endif
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched (v is local)
candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost
if (candidateMate[v - StartIndex] == u) {
GMate[Ghost2LocalMap[u]] = v; // u is ghost
Mate[v - StartIndex] = u; // v is local
U.push_back(v);
U.push_back(u);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of REQUEST
else { // CASE II: SUCCESS
if (message_type == SUCCESS) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is SUCCESS" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a dummy Mate so that we do not process u (a ghost) again
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched ( v is local)
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k,
verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) {
// w is a ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
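// If the ghost w already has v recorded as its candidate (set while handling an
// earlier message), the match (v,w) can be committed locally right away instead
// of waiting for a reply from w's owner.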
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
Mate[v - StartIndex] = w; // v is local
GMate[Ghost2LocalMap[w]] = v; // w is ghost
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
Mate[v - StartIndex] = w; // v is local
Mate[w - StartIndex] = v; // w is local
// Q.push_back(u);
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
} // End of if (candidateMate[w] == v)
} // End of Else
} // End of if(w >=0)
else { // No dominant edge found
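// v found no dominating edge: send FAILURE to every ghost neighbour so its
// owner can stop waiting on this cross edge (handled as CASE III on the receiving side).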
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) {
// A ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
} // End of if(GHOST)
} // End of for loop
} // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of if ( message_type == SUCCESS )
else {
// CASE III: FAILURE
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is FAILURE" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // u is a ghost: set a dummy mate so that we do not process it anymore
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter
} // End of else: CASE III
} // End of else: CASE I
}
return;
}
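The owner lookups above go through the vertex distribution array verDistance; the usual convention is that process p owns the contiguous block of global vertices [verDistance[p], verDistance[p+1]). The real findOwnerOfGhost is defined elsewhere in MatchBoxPC; the following is only an illustrative sketch of such a lookup (the linear scan and the function name are assumptions, not the library's implementation):

    // Hypothetical sketch: find the rank owning global vertex w, given a
    // non-decreasing distribution array with numProcs+1 entries.
    MilanInt findOwnerSketch(MilanLongInt w, const MilanLongInt *verDistance, MilanInt numProcs) {
        for (MilanInt p = 0; p < numProcs; p++) {
            if ((verDistance[p] <= w) && (w < verDistance[p + 1]))
                return p; // w falls in p's contiguous block
        }
        return -1; // out of range: no owner found
    }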

@ -0,0 +1,36 @@
#include "MatchBoxPC.h"
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
}
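A typical use of this helper (a sketch, not taken from the repository) is at the end of a thread's chunk of work: each thread accumulates vertices and outgoing messages in its private vectors and then flushes them into the shared queues in one shot, so each critical section is entered once per thread rather than once per pushed element. The shared/private vector names below mirror the signature above; nWork and the loop body are placeholders:

    #include "MatchBoxPC.h"
    // Illustrative only: shared queues plus per-thread private buffers,
    // flushed once per thread via queuesTransfer.
    vector<MilanLongInt> U, QLocalVtx, QGhostVtx, QMsgType;
    vector<MilanInt> QOwner;
    long nWork = 1000; // example work count
    #pragma omp parallel
    {
        vector<MilanLongInt> privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType;
        vector<MilanInt> privateQOwner;
        #pragma omp for nowait
        for (long i = 0; i < nWork; i++) {
            // ... append to the private vectors while processing item i ...
        }
        // One critical section per thread instead of one per element.
        queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner,
                       privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner);
    }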

@ -0,0 +1,209 @@
#include "MatchBoxPC.h"
void sendBundledMessages(MilanLongInt *numGhostEdges,
MilanInt *BufferSize,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActual,
MilanLongInt *msgInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus)
{
MilanLongInt myIndex = 0, numMessagesToSend;
MilanInt i = 0, OneMessageSize = 0;
#ifdef DEBUG_HANG_
if (myRank == 0) {
cout << "\n(" << myRank << ") Send Bundles" << endl;
fflush(stdout);
}
#endif
#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD)
{
#pragma omp master
{
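// The setup below is split into OpenMP tasks with depend clauses: allocations,
// the prefix sum over PCounter, the bundle packing and the ISends are separate
// tasks, so independent pieces may overlap while the declared dependencies
// keep the packing/sending order intact.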
// Data structures for Bundled Messages:
#pragma omp task depend(inout \
: PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \
: NumMessagesBundled, numProcs)
{
try {
PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message
PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure
PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize
PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable
PSizeInfoMessages.resize(numProcs * 3, 0);
}
#pragma omp task depend(inout \
: PCumulative) depend(in \
: PCounter)
{
for (i = 0; i < numProcs; i++)
PCumulative[i + 1] = PCumulative[i] + PCounter[i];
}
#pragma omp task depend(inout \
: PCounter)
{
// Reuse PCounter to keep track of how many messages were inserted:
for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses need to be integers!
PCounter[i] = 0;
}
// Build the Message Bundle packet:
#pragma omp task depend(in \
: PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \
: myIndex, PMessageBundle, PCounter)
{
for (i = 0; i < NumMessagesBundled; i++) {
myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3;
PMessageBundle[myIndex + 0] = QLocalVtx[i];
PMessageBundle[myIndex + 1] = QGhostVtx[i];
PMessageBundle[myIndex + 2] = QMsgType[i];
PCounter[QOwner[i]]++;
}
}
// Send the Bundled Messages: Use ISend
#pragma omp task depend(out \
: SRequest, SStatus)
{
try
{
SRequest.reserve(numProcs * 2); // At most two messages per processor
SStatus.reserve(numProcs * 2); // At most two messages per processor
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
}
// Send the Messages
#pragma omp task depend(inout \
: SRequest, PSizeInfoMessages, PCumulative) depend(out \
: *msgActual, *msgInd)
{
for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses need to be integers!
if (i == myRank) // Do not send anything to yourself
continue;
// Send the Message with information about the size of next message:
// Build the Message Packet:
PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message
PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet
PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE
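// The fixed-size SIZEINFO packet lets the receiver learn how large the
// following BundleTag message is before posting the matching receive.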
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl;
fflush(stdout);
#endif
if (PSizeInfoMessages[i * 3 + 0] > 0)
{ // Send only if it is a nonempty packet
MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap<MilanLongInt>(), i, ComputeTag, comm,
&SRequest[(*msgInd)]);
(*msgActual)++;
(*msgInd)++;
// Now Send the message with the data packet:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl;
for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++)
cout << PMessageBundle[k] << ",";
cout << endl;
fflush(stdout);
#endif
MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0],
TypeMap<MilanLongInt>(), i, BundleTag, comm, &SRequest[(*msgInd)]);
(*msgInd)++;
} // End of if size > 0
}
}
#pragma omp task depend(inout \
: PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
// Free up temporary memory:
PCumulative.clear();
QLocalVtx.clear();
QGhostVtx.clear();
QMsgType.clear();
QOwner.clear();
}
#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2;
cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled;
if (*numGhostEdges > 0)
{
cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
// Allocate memory for MPI Send messages:
/* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */
OneMessageSize = 0;
MPI_Pack_size(3, TypeMap<MilanLongInt>(), comm, &OneMessageSize); // Size of one message packet
// How many messages to send?
// Potentially three kinds of messages will be sent/received:
// Request, Success, Failure.
// But only two will be sent from a given processor.
// Subtract the number of messages that have already been sent as bundled messages:
numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled;
*BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend;
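// Note: MPI_Bsend needs the attached buffer to provide, for each pending
// buffered send, room for the packed message plus MPI_BSEND_OVERHEAD; hence
// the (OneMessageSize + MPI_BSEND_OVERHEAD) term per remaining message.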
}
#pragma omp task depend(out : Buffer) depend(in : *BufferSize)
{
Buffer = 0;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize;
cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD;
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend;
cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize);
cout << "\n(" << myRank << ")Attaching Buffer on.. ";
fflush(stdout);
#endif
if ((*BufferSize) > 0)
{
Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory
if (Buffer == 0)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n";
exit(1);
}
MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer
}
}
}
}
}
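The bundle packing above is a counting-sort placement: PCumulative is the exclusive prefix sum of the per-destination counters (a CSR-style row pointer over ranks), and PCounter is reused as a running offset inside each destination's slot. A small self-contained sketch of the same pattern, with plain standard-library types and illustrative names:

    #include <vector>
    // Illustrative sketch of the CSR-style placement used for PMessageBundle.
    std::vector<long> packByOwner(const std::vector<int> &owner,     // destination of each item
                                  const std::vector<long> &payload,  // one value per item
                                  int nDest) {
        std::vector<long> start(nDest + 1, 0), offset(nDest, 0);
        for (int d : owner) start[d + 1]++;                       // count items per destination
        for (int p = 0; p < nDest; p++) start[p + 1] += start[p]; // exclusive prefix sum
        std::vector<long> packed(owner.size());
        for (size_t i = 0; i < owner.size(); i++) {
            int d = owner[i];
            packed[start[d] + offset[d]++] = payload[i];          // place item in its slot
        }
        return packed;                                            // contiguous per destination
    }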

@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_cspmat_type) :: ac, op_restr, op_prol type(psb_cspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_c_onelev_mat_asb' name='amg_c_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_dspmat_type) :: ac, op_restr, op_prol type(psb_dspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_d_onelev_mat_asb' name='amg_d_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_sspmat_type) :: ac, op_restr, op_prol type(psb_sspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_s_onelev_mat_asb' name='amg_s_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_zspmat_type) :: ac, op_restr, op_prol type(psb_zspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_z_onelev_mat_asb' name='amg_z_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='c_gs_solver_bld', ch_err character(len=20) :: name='c_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_gs_solver_bld', ch_err character(len=20) :: name='d_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='s_gs_solver_bld', ch_err character(len=20) :: name='s_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='z_gs_solver_bld', ch_err character(len=20) :: name='z_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -0,0 +1,25 @@
cd amgprec/impl/aggregator/
rm MatchBoxPC.o
rm sendBundledMessages.o
rm initialize.o
rm extractUChunk.o
rm isAlreadyMatched.o
rm findOwnerOfGhost.o
rm computeCandidateMate.o
rm parallelComputeCandidateMateB.o
rm processMatchedVertices.o
rm processCrossEdge.o
rm queueTransfer.o
rm processMessages.o
rm processExposedVertex.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
cd ../../../
make all
cd samples/advanced/pdegen
make amg_d_pde3d
cd runs
mpirun -np 4 amg_d_pde3d amg_pde3d.inp

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -128,7 +131,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_dpk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2 real(psb_dpk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt) call psb_barrier(ctxt)
talc = psb_wtime()-t0 talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_dpk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_dpk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) #if defined(OPENMP)
! x, y, z coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
z = (iz-1)*deltah nth = 1
zt(k) = f_(x,y,z) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y,z) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row !$omp do schedule(dynamic)
icoeff = icoeff+1 !
endif do ii=1, nlr, nb
! term depending on (x,y-1,z) if (info /= psb_success_) cycle
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 ib = min(nb,nlr-ii+1)
if (iy == 1) then icoeff = 1
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) do k=1,ib
else i=ii+k-1
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) ! local matrix pointer
irow(icoeff) = glob_row glob_row=myidx(i)
icoeff = icoeff+1 ! compute gridpoint coordinates
endif call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! term depending on (x,y,z-1) ! x, y, z coordinates
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 x = (ix-1)*deltah
if (iz == 1) then y = (iy-1)*deltah
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) z = (iz-1)*deltah
else zt(k) = f_(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) ! internal point: build discretization
irow(icoeff) = glob_row !
icoeff = icoeff+1 ! term depending on (x-1,y,z)
endif !
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
! term depending on (x,y,z) if (ix == 1) then
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
& + c(x,y,z) else
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
! term depending on (x,y,z+1) endif
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 ! term depending on (x,y-1,z)
if (iz == idim) then val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) if (iy == 1) then
else zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x,y+1,z) endif
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 ! term depending on (x,y,z-1)
if (iy == idim) then val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) if (iz == 1) then
else zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x+1,y,z) endif
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then ! term depending on (x,y,z)
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
else & + c(x,y,z)
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=dzero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
@ -490,7 +500,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
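The rewritten generation loop above moves the val/irow/icol scratch arrays inside an OpenMP block so that every thread owns its own coefficient buffers, and distributes the row blocks with a dynamic schedule before calling psb_spins/psb_geins on its block. A compact sketch of the same shape in C++/OpenMP (buildRows and insertBlock are hypothetical stand-ins for the row generation and for the library's insertion routines, which the code above relies on being callable concurrently):

    #include <algorithm>
    #include <functional>
    #include <vector>
    // Sketch only: block-wise assembly with per-thread scratch buffers.
    void assembleSketch(long nlr, long nb,
                        const std::function<long(long, long, std::vector<long> &,
                                                 std::vector<long> &, std::vector<double> &)> &buildRows,
                        const std::function<void(long, const std::vector<long> &,
                                                 const std::vector<long> &, const std::vector<double> &)> &insertBlock) {
        #pragma omp parallel
        {
            std::vector<long> irow(20 * nb), icol(20 * nb);  // per-thread index buffers
            std::vector<double> val(20 * nb);                // per-thread coefficient buffer
            #pragma omp for schedule(dynamic)
            for (long ii = 0; ii < nlr; ii += nb) {
                long ib = std::min(nb, nlr - ii);            // rows in this block
                long ncoeff = buildRows(ii, ib, irow, icol, val); // fill this block's triplets
                insertBlock(ncoeff, irow, icol, val);        // assumed concurrency-safe insertion
            }
        }
    }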
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -591,7 +603,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_dpk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999 goto 9999
end if end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_dpk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_dpk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,glob_row,idim,idim) #if defined(OPENMP)
! x, y coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
nth = 1
zt(k) = f_(x,y) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row ! loop over rows belonging to current process in a block
icoeff = icoeff+1 ! distribution.
endif !$omp do schedule(dynamic)
! term depending on (x,y-1) !
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 do ii=1, nlr,nb
if (iy == 1) then ib = min(nb,nlr-ii+1)
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) icoeff = 1
else do k=1,ib
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) i=ii+k-1
irow(icoeff) = glob_row ! local matrix pointer
icoeff = icoeff+1 glob_row=myidx(i)
endif ! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! term depending on (x,y) ! x, y coordinates
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) x = (ix-1)*deltah
call ijk2idx(icol(icoeff),ix,iy,idim,idim) y = (iy-1)*deltah
irow(icoeff) = glob_row
icoeff = icoeff+1 zt(k) = f_(x,y)
! term depending on (x,y+1) ! internal point: build discretization
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 !
if (iy == idim) then ! term depending on (x-1,y)
zt(k) = g(x,done)*(-val(icoeff)) + zt(k) !
else val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) if (ix == 1) then
irow(icoeff) = glob_row zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k)
icoeff = icoeff+1 else
endif call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
! term depending on (x+1,y) irow(icoeff) = glob_row
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 icoeff = icoeff+1
if (ix==idim) then endif
zt(k) = g(done,y)*(-val(icoeff)) + zt(k) ! term depending on (x,y-1)
else val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) if (iy == 1) then
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(done,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=dzero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
@ -912,8 +938,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call psb_cdasb(desc_a,info) call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_d_pde2d
use amg_d_pde2d_exp_mod use amg_d_pde2d_exp_mod
use amg_d_pde2d_box_mod use amg_d_pde2d_box_mod
use amg_d_genpde_mod use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -93,7 +96,7 @@ program amg_d_pde2d
type(psb_d_vect_type) :: x,b,r type(psb_d_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_d_pde2d
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_d_pde2d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs() write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_d_pde3d
use amg_d_pde3d_exp_mod use amg_d_pde3d_exp_mod
use amg_d_pde3d_gauss_mod use amg_d_pde3d_gauss_mod
use amg_d_genpde_mod use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -94,7 +97,7 @@ program amg_d_pde3d
type(psb_d_vect_type) :: x,b,r type(psb_d_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_d_pde3d
! other variables ! other variables
integer(psb_ipk_) :: info, i, k integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err character(len=20) :: name,ch_err
type(psb_d_csr_sparse_mat) :: amold
info=psb_success_ info=psb_success_
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_d_pde3d
end if end if
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info) call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1 tprec = psb_wtime()-t1
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
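In the hunk above, smoothers_build now receives amold=amold, where amold is the psb_d_csr_sparse_mat variable declared earlier in this patch. The argument acts as a mold: the matrices assembled inside the smoothers take the storage format of the actual argument, CSR in this case. A minimal hedged sketch of the pattern (the surrounding build sequence is the usual one, variable names are illustrative):

      type(psb_d_csr_sparse_mat) :: amold   ! mold variable: its dynamic type selects the format

      call prec%hierarchy_build(a, desc_a, info)
      call prec%smoothers_build(a, desc_a, info, amold=amold)   ! smoother matrices stored as CSR
      ! Passing a mold of a different extension of the base sparse-matrix class
      ! (e.g. a COO type) would select that format instead.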
@ -455,7 +467,9 @@ program amg_d_pde3d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_d_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if end if
call psb_print_timers(ctxt)
! !
! cleanup storage and exit ! cleanup storage and exit
! !
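The added call psb_print_timers(ctxt) prints, per process, the timers accumulated through the PSBLAS timer utilities. A hedged sketch of the usage pattern follows; the routine names are as I recall them from psb_base_mod and should be checked against the PSBLAS release in use:

      integer(psb_ipk_), save :: idx_gen = -1

      if (idx_gen == -1) idx_gen = psb_get_timer_idx("matrix generation")
      call psb_tic(idx_gen)
      ! ... timed section ...
      call psb_toc(idx_gen)
      ! at the end of the run, one call prints every registered timer
      call psb_print_timers(ctxt)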

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -128,7 +131,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_spk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2 real(psb_spk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt) call psb_barrier(ctxt)
talc = psb_wtime()-t0 talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_spk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_spk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) #if defined(OPENMP)
! x, y, z coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
z = (iz-1)*deltah nth = 1
zt(k) = f_(x,y,z) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y,z) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row !$omp do schedule(dynamic)
icoeff = icoeff+1 !
endif do ii=1, nlr, nb
! term depending on (x,y-1,z) if (info /= psb_success_) cycle
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 ib = min(nb,nlr-ii+1)
if (iy == 1) then icoeff = 1
zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) do k=1,ib
else i=ii+k-1
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) ! local matrix pointer
irow(icoeff) = glob_row glob_row=myidx(i)
icoeff = icoeff+1 ! compute gridpoint coordinates
endif call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! term depending on (x,y,z-1) ! x, y, z coordinates
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 x = (ix-1)*deltah
if (iz == 1) then y = (iy-1)*deltah
zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) z = (iz-1)*deltah
else zt(k) = f_(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) ! internal point: build discretization
irow(icoeff) = glob_row !
icoeff = icoeff+1 ! term depending on (x-1,y,z)
endif !
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
! term depending on (x,y,z) if (ix == 1) then
val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k)
& + c(x,y,z) else
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
! term depending on (x,y,z+1) endif
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 ! term depending on (x,y-1,z)
if (iz == idim) then val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) if (iy == 1) then
else zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x,y+1,z) endif
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 ! term depending on (x,y,z-1)
if (iy == idim) then val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) if (iz == 1) then
else zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x+1,y,z) endif
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then ! term depending on (x,y,z)
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
else & + c(x,y,z)
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=szero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
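The hunk above wraps the serial block-by-block assembly loop of the 3D generator in an OpenMP parallel region: a Fortran block construct gives each thread its own irow/icol/val work arrays and zt buffer, the loop over row blocks is shared out with !$omp do schedule(dynamic), rows are inserted into the shared matrix with psb_spins (called concurrently by the threads, so thread safety of the insertion path is assumed by the patch), and errors are recorded in the shared info instead of branching out of the construct. The stripped-down sketch below shows only the structure; the stencil computation and the right-hand-side insertion are omitted and the comments are mine, not part of the patch:

      !$omp parallel shared(a, desc_a, myidx, nlr, info)
      block
        integer(psb_ipk_)              :: ii, ib, k, i, icoeff
        integer(psb_lpk_), allocatable :: irow(:), icol(:)
        real(psb_spk_), allocatable    :: val(:)

        ! per-thread work arrays: every thread executing the block gets its own copies
        allocate(val(20*nb), irow(20*nb), icol(20*nb), stat=info)
        ! no "goto 9999" here: branching out of an OpenMP construct is not allowed,
        ! so an allocation failure is left in the shared info and the loop body skips

        !$omp do schedule(dynamic)
        do ii = 1, nlr, nb
          if (info /= psb_success_) cycle        ! a previous block failed: do nothing more
          ib     = min(nb, nlr-ii+1)
          icoeff = 1
          do k = 1, ib
            i = ii + k - 1
            ! ... fill val/irow/icol for global row myidx(i) (stencil code omitted) ...
          end do
          call psb_spins(icoeff-1, irow, icol, val, a, desc_a, info)
          if (info /= psb_success_) cycle
        end do
        !$omp end do

        deallocate(val, irow, icol)
      end block
      !$omp end parallel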
@ -490,7 +500,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -591,7 +603,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_spk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2, dd real(psb_spk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999 goto 9999
end if end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_spk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_spk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,glob_row,idim,idim) #if defined(OPENMP)
! x, y coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
nth = 1
zt(k) = f_(x,y) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row ! loop over rows belonging to current process in a block
icoeff = icoeff+1 ! distribution.
endif !$omp do schedule(dynamic)
! term depending on (x,y-1) !
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 do ii=1, nlr,nb
if (iy == 1) then ib = min(nb,nlr-ii+1)
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) icoeff = 1
else do k=1,ib
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) i=ii+k-1
irow(icoeff) = glob_row ! local matrix pointer
icoeff = icoeff+1 glob_row=myidx(i)
endif ! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! term depending on (x,y) ! x, y coordinates
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) x = (ix-1)*deltah
call ijk2idx(icol(icoeff),ix,iy,idim,idim) y = (iy-1)*deltah
irow(icoeff) = glob_row
icoeff = icoeff+1 zt(k) = f_(x,y)
! term depending on (x,y+1) ! internal point: build discretization
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 !
if (iy == idim) then ! term depending on (x-1,y)
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) !
else val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) if (ix == 1) then
irow(icoeff) = glob_row zt(k) = g(szero,y)*(-val(icoeff)) + zt(k)
icoeff = icoeff+1 else
endif call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
! term depending on (x+1,y) irow(icoeff) = glob_row
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 icoeff = icoeff+1
if (ix==idim) then endif
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) ! term depending on (x,y-1)
else val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) if (iy == 1) then
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=szero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
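The 2D generator above gets the same treatment as the 3D one. The error handling is the part worth noting separately: the serial loop used exit to abandon assembly on failure, but an !$omp do must run all of its iterations, so the parallel version records the failure in info and lets the remaining blocks fall through with cycle. A minimal illustration of the pattern (insert_block is a placeholder standing in for the psb_spins/psb_geins calls, not a routine from the patch):

      !$omp do schedule(dynamic)
      do ii = 1, nlr, nb
        if (info /= psb_success_) cycle   ! an earlier block failed: skip the rest
        call insert_block(ii, info)       ! placeholder for the actual insertion calls
      end do
      !$omp end do
      ! after the parallel region, info is tested once and the usual error path
      ! (psb_errpush / goto 9999) is taken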
@ -912,8 +938,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call psb_cdasb(desc_a,info) call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_s_pde2d
use amg_s_pde2d_exp_mod use amg_s_pde2d_exp_mod
use amg_s_pde2d_box_mod use amg_s_pde2d_box_mod
use amg_s_genpde_mod use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -93,7 +96,7 @@ program amg_s_pde2d
type(psb_s_vect_type) :: x,b,r type(psb_s_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_s_pde2d
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_s_pde2d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs() write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_s_pde3d
use amg_s_pde3d_exp_mod use amg_s_pde3d_exp_mod
use amg_s_pde3d_gauss_mod use amg_s_pde3d_gauss_mod
use amg_s_genpde_mod use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -94,7 +97,7 @@ program amg_s_pde3d
type(psb_s_vect_type) :: x,b,r type(psb_s_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_s_pde3d
! other variables ! other variables
integer(psb_ipk_) :: info, i, k integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err character(len=20) :: name,ch_err
type(psb_s_csr_sparse_mat) :: amold
info=psb_success_ info=psb_success_
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_s_pde3d
end if end if
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info) call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1 tprec = psb_wtime()-t1
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
@ -455,7 +467,9 @@ program amg_s_pde3d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_s_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if end if
call psb_print_timers(ctxt)
! !
! cleanup storage and exit ! cleanup storage and exit
! !

@ -1,6 +1,6 @@
%%%%%%%%%%% General arguments % Lines starting with % are ignored. %%%%%%%%%%% General arguments % Lines starting with % are ignored.
CSR ! Storage format CSR COO JAD CSR ! Storage format CSR COO JAD
0080 ! IDIM; domain size. Linear system size is IDIM**3 0200 ! IDIM; domain size. Linear system size is IDIM**3
CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE
BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES
2 ! ISTOPC 2 ! ISTOPC
@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F
30 ! IRST (restart for RGMRES and BiCGSTABL) 30 ! IRST (restart for RGMRES and BiCGSTABL)
1.d-6 ! EPS 1.d-6 ! EPS
%%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%% %%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%%
ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars)
ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML
%%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%% %%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%%
FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous. FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous.
@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL
-3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default -3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default
-3 ! Target coarse matrix size per process; if <0, lib default -3 ! Target coarse matrix size per process; if <0, lib default
SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED
COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED
MATCHBOXP ! aggregation measure SOC1, MATCHBOXP SOC1 ! aggregation measure SOC1, MATCHBOXP
8 ! Requested size of the aggregates for MATCHBOXP 8 ! Requested size of the aggregates for MATCHBOXP
NATURAL ! Ordering of aggregation NATURAL DEGREE NATURAL ! Ordering of aggregation NATURAL DEGREE
-1.5 ! Coarsening ratio, if < 0 use library default -1.5 ! Coarsening ratio, if < 0 use library default
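The test input above switches the default run from coupled, matching-based aggregation (COUPLED / MATCHBOXP) back to decoupled aggregation with the classical strength-of-connection measure (DEC / SOC1), and enlarges the test problem from IDIM=80 to IDIM=200. For reference, a hedged sketch of how the same choices are typically set on the preconditioner object in user code; the parameter names are those documented in the AMG4PSBLAS user guide and should be verified against the release in use:

      call prec%init(ctxt, 'ML', info)
      call prec%set('PAR_AGGR_ALG', 'DEC',      info)   ! decoupled aggregation
      call prec%set('AGGR_TYPE',    'SOC1',     info)   ! strength-of-connection measure
      call prec%set('AGGR_PROL',    'SMOOTHED', info)   ! smoothed aggregation prolongator
      call prec%set('ML_CYCLE',     'VCYCLE',   info)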
