[WIP] RMA swapdata optimization, nest builder updates, GPU/scorep test tooling
- psi_dswapdata / psb_comm_rma: RMA swapdata optimization (PSCW + Win_allocate) - psb_d_nest_builder: builder updates - cuda: Makefile + psb_d_cuda_vect_mod adjustments - test/comm: spmv/cg drivers, Makefiles, GPU + scorep/strong-scaling scripts - test/nested: halo-regime test + Makefile/CMake updatescommunication_v2
parent
4106f44331
commit
9533c22989
@ -0,0 +1,89 @@
|
||||
#!/bin/bash
|
||||
# ============================================================================
|
||||
# gpu_cg.sh GPU Conjugate Gradient across comm backends -- "sparse ranks"
|
||||
#
|
||||
# Final test of the comm-scheme study: a full CG solve (not just spmv) on GPU,
|
||||
# with few MPI ranks (1 rank = 1 GPU) spread over MANY nodes so the halo
|
||||
# exchange is dominated by inter-node traffic -- the regime where RMA may pay
|
||||
# off. The test sweeps ALL backends (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET,
|
||||
# MPI_PUT) x 3 preconditioners internally, so there is no per-mode loop here.
|
||||
#
|
||||
# It also self-verifies, for every scheme, that an internally-allocated work
|
||||
# vector (like CG's hidden r/p/q/z) inherits the scheme from desc_a%comm_type
|
||||
# -- look for "[OK] internal-style work vector inherited scheme" in run.out,
|
||||
# and abort on "INTERNAL-VECTOR SCHEME MISMATCH".
|
||||
# ============================================================================
|
||||
#SBATCH --job-name=cg_gpu
|
||||
#SBATCH --output=cg_gpu_%j.out
|
||||
#SBATCH --error=cg_gpu_%j.err
|
||||
#SBATCH --nodes=8
|
||||
#SBATCH --ntasks-per-node=4 # MN5-ACC: 4x H100 per node
|
||||
#SBATCH --gres=gpu:4
|
||||
#SBATCH --cpus-per-task=20 # 80 cores / 4 ranks
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=acc_debug
|
||||
#SBATCH --account=ehpc580
|
||||
|
||||
# ============================================================================
|
||||
# USER CONFIGURATION
|
||||
# ============================================================================
|
||||
NREP=8 # CG solves per (scheme,prec) for statistics
|
||||
NWARM=1 # warm-up solves (discarded)
|
||||
ITMAX=500 # CG iteration cap
|
||||
IDIM=200 # ignored when --matrix is given
|
||||
|
||||
# --- "sparse" knob: GPUs (= ranks) per node.
|
||||
# RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm.
|
||||
RANKS_PER_NODE=1
|
||||
RANK_POINTS="2 4 8" # total ranks per scale point
|
||||
|
||||
EXE=$HOME/Desktop/scorep/psblas3/test/comm/cg/runs/psb_comm_cg_test
|
||||
MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx
|
||||
|
||||
# ============================================================================
|
||||
# ENVIRONMENT
|
||||
# ============================================================================
|
||||
module purge
|
||||
module load bsc/1.0
|
||||
module load nvidia-hpc-sdk/25.3
|
||||
module load gcc/12.3.0
|
||||
module load ucx/1.16.0-gcc
|
||||
module load openmpi/5.0.5-gcc
|
||||
module load openblas/0.3.27-gcc
|
||||
export PATH="$HOME/scorep-cuda/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
|
||||
|
||||
export OMPI_MCA_coll_hcoll_enable=0
|
||||
export SCOREP_CUDA_ENABLE=yes
|
||||
export SCOREP_CUDA_BUFFER=48M
|
||||
|
||||
RESDIR=$SLURM_SUBMIT_DIR/results_cg_gpu_${SLURM_JOB_ID}
|
||||
mkdir -p $RESDIR
|
||||
|
||||
echo "=== CG GPU (sparse-ranks study) ==="
|
||||
echo " nrep=$NREP nwarm=$NWARM itmax=$ITMAX ranks_per_node=$RANKS_PER_NODE"
|
||||
echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS]"
|
||||
echo "==================================="
|
||||
|
||||
for NRANKS in $RANK_POINTS; do
|
||||
NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE ))
|
||||
STEP_DIR=$RESDIR/${NRANKS}ranks
|
||||
mkdir -p $STEP_DIR
|
||||
|
||||
echo ""
|
||||
echo ">>> CG GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)"
|
||||
|
||||
srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \
|
||||
--gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \
|
||||
$EXE $IDIM $NREP $NWARM $ITMAX --gpu=TRUE --matrix=$MATRIX --fmt=MM \
|
||||
> $STEP_DIR/run.out 2>&1
|
||||
|
||||
echo ">>> exit=$? output=$STEP_DIR/run.out"
|
||||
# quick propagation sanity at submit time:
|
||||
grep -q "INTERNAL-VECTOR SCHEME MISMATCH" $STEP_DIR/run.out \
|
||||
&& echo " !! SCHEME PROPAGATION FAILED at $NRANKS ranks -- check run.out"
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== CG GPU DONE. Results: $RESDIR ==="
|
||||
ls -R $RESDIR
|
||||
@ -0,0 +1,206 @@
|
||||
#!/bin/bash
|
||||
# ============================================================================
|
||||
# extract_scorep.sh -- auto-extract comm-scheme measurements from a
|
||||
# results_*_<jobid> directory (swapdata/spmv style).
|
||||
#
|
||||
# For every <N>ranks/ subdir it collects:
|
||||
# 1) run.out wall-clock avg time per backend (the REAL headline number)
|
||||
# 2) cube_stat -r per swap routine from scorep_profile/profile.cubex:
|
||||
# - INCL (total time in the routine, summed over call paths)
|
||||
# - one-time setup (Win_create/Win_allocate/topology_init/
|
||||
# ini_memory_buffer_layout/alltoallv_init)
|
||||
# - steady = INCL - one-time
|
||||
# - every MPI_* component (to scorep_components.csv)
|
||||
#
|
||||
# Output (written into RESULTS_DIR):
|
||||
# runout.csv ranks,backend,total_s,avg_s
|
||||
# scorep_summary.csv ranks,scheme,incl_s,onetime_s,steady_s
|
||||
# scorep_components.csv ranks,scheme,component,time_s
|
||||
# plus pretty pivot tables on stdout.
|
||||
#
|
||||
# Usage:
|
||||
# ./extract_scorep.sh # process ALL <N>ranks/ in current dir
|
||||
# ./extract_scorep.sh 80ranks # process ONE rank-point dir
|
||||
# ./extract_scorep.sh path/to/results # process ALL <N>ranks/ under that dir
|
||||
#
|
||||
# CSVs ACCUMULATE (header written only if absent), so you can run one
|
||||
# rank-point at a time and the pivots grow. To start over: rm *.csv
|
||||
# ============================================================================
|
||||
set -u
|
||||
ARG="${1:-.}"
|
||||
OUTDIR="${OUTDIR:-.}" # where the CSVs go (default: current dir)
|
||||
|
||||
command -v cube_stat >/dev/null 2>&1 || { echo "ERROR: cube_stat not in PATH"; exit 1; }
|
||||
|
||||
# scheme label : mangled routine name used by cube_stat -r
|
||||
SCHEMES=(
|
||||
"baseline:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_baseline_vect"
|
||||
"neighbor:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_topology_vect"
|
||||
"persistent:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_persistent_topology_vect"
|
||||
"rma_pull:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_pull_vect"
|
||||
"rma_push:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_push_vect"
|
||||
)
|
||||
|
||||
RUNOUT="$OUTDIR/runout.csv"
|
||||
SUMMARY="$OUTDIR/scorep_summary.csv"
|
||||
COMPONENTS="$OUTDIR/scorep_components.csv"
|
||||
# write header only if the file does not exist yet (append/accumulate mode)
|
||||
[ -f "$RUNOUT" ] || echo "ranks,backend,total_s,avg_s,setup_s" > "$RUNOUT"
|
||||
[ -f "$SUMMARY" ] || echo "ranks,scheme,incl_s,setup_s,gpuwait_s,pack_s,mpimove_s,mpiwait_s,netmpi_s" > "$SUMMARY"
|
||||
[ -f "$COMPONENTS" ] || echo "ranks,scheme,bucket,component,time_s" > "$COMPONENTS"
|
||||
|
||||
# Select dirs: a single rank-point dir (has scorep_profile/run.out) or, if ARG
|
||||
# is a results dir, all its <N>ranks/ subdirs.
|
||||
if [ -f "$ARG/scorep_profile/profile.cubex" ] || [ -f "$ARG/run.out" ]; then
|
||||
DIRS=( "$ARG" )
|
||||
else
|
||||
DIRS=( $(ls -d "$ARG"/*ranks 2>/dev/null | sort -t/ -k99 -V) )
|
||||
fi
|
||||
[ ${#DIRS[@]} -eq 0 ] && { echo "No rank-point dir(s) found under '$ARG'"; exit 1; }
|
||||
|
||||
for d in "${DIRS[@]}"; do
|
||||
ranks=$(basename "$d" | sed 's/[^0-9]//g')
|
||||
[ -z "$ranks" ] && continue
|
||||
|
||||
# ---- 1) run.out wall-clock avg + setup per backend ----
|
||||
# One run.out holds several "comm backend" blocks in sequence; we accumulate
|
||||
# tot/avg/setup for the current block and flush it when the next block starts
|
||||
# (or at EOF). "setup time" is optional: 0 if the benchmark does not emit it
|
||||
# (i.e. until the warm-up-timing patch lands), so the break-even falls back to
|
||||
# pure steady-state comparison.
|
||||
if [ -f "$d/run.out" ]; then
|
||||
awk -v r="$ranks" '
|
||||
function flush(){ if (be!=""){ print r","be","tot","avg","setup } be="";tot=0;avg=0;setup=0 }
|
||||
/comm backend/ { flush(); split($0,a,":"); be=a[2]; gsub(/ /,"",be) }
|
||||
/total time/ { split($0,a,":"); tot=a[2]+0 }
|
||||
/avg time/ { split($0,a,":"); avg=a[2]+0 }
|
||||
/setup time/ { split($0,a,":"); setup=a[2]+0 }
|
||||
END { flush() }
|
||||
' "$d/run.out" >> "$RUNOUT"
|
||||
fi
|
||||
|
||||
# ---- 2) scorep cube_stat per scheme ----
|
||||
cubex="$d/scorep_profile/profile.cubex"
|
||||
[ -f "$cubex" ] || { echo "WARN: no profile.cubex in $d" >&2; continue; }
|
||||
|
||||
for entry in "${SCHEMES[@]}"; do
|
||||
label="${entry%%:*}"; routine="${entry#*:}"
|
||||
out=$(cube_stat -r "$routine" "$cubex" 2>/dev/null)
|
||||
[ -z "$out" ] && continue
|
||||
printf '%s\n' "$out" | awk -v ranks="$ranks" -v scheme="$label" -v comp="$COMPONENTS" '
|
||||
# Bucketize every child component of the swap routine. cube_stat -r reports,
|
||||
# for each child, its inclusive time within the routine subtree, and the
|
||||
# children partition INCL -- so per-bucket sums are meaningful.
|
||||
#
|
||||
# Why this matters on GPU: the single largest component in EVERY scheme is
|
||||
# d_cuda_device_wait (~0.036s, the GPU stream sync), which is NOT network
|
||||
# time and is ~equal across schemes. The old "steady = INCL - onetime" left
|
||||
# it inside, masking the real differentiator -- the MPI wait/sync time.
|
||||
# Buckets (each component lands in exactly one, order matters):
|
||||
# setup : one-time Win_create / topology_init / alltoallv_init /
|
||||
# ini_memory_buffer_layout
|
||||
# gpuwait : GPU stream sync (device_wait, cuda_sync) -- exclude from MPI
|
||||
# mpiwait : blocking MPI sync (MPI_Wait, Win_wait/post/start/complete)
|
||||
# <-- the discriminating metric ("guarda le wait")
|
||||
# mpimove : actual transfer (Isend/Irecv/Put/Get/Start/[I]neighbor_alltoallv)
|
||||
# pack : gather/scatter/buffer (gthz*, sctb*, new_buffer, realloc)
|
||||
# netmpi = mpimove + mpiwait (honest per-iteration network cost)
|
||||
{
|
||||
k=split($0,a,","); v=a[k]+0 # value = last comma-field
|
||||
nm=$0; sub(/,[^,]*$/,"",nm) # name = everything before it
|
||||
if (nm ~ /^INCL\(/) { incl+=v; next }
|
||||
if (nm ~ /^EXCL\(/) { next }
|
||||
bkt="other"
|
||||
if (nm ~ /Win_create|Win_allocate|topology_init|ini_memory_buffer_layout|alltoallv_init/) { setup+=v; bkt="setup" }
|
||||
else if (nm ~ /device_wait|cuda_sync/) { gpuwait+=v; bkt="gpuwait" }
|
||||
else if (nm ~ /MPI_Wait|Win_wait|Win_post|Win_start|Win_complete/) { mpiwait+=v; bkt="mpiwait" }
|
||||
else if (nm ~ /MPI_Isend|MPI_Irecv|MPI_Send|MPI_Recv|MPI_Put|MPI_Get|MPI_Start|neighbor_alltoallv|Neighbor_alltoallv/) { mpimove+=v; bkt="mpimove" }
|
||||
else if (nm ~ /gthz|sctb|new_buffer|realloc/) { pack+=v; bkt="pack" }
|
||||
cn=nm; sub(/.*::/,"",cn); gsub(/"/,"",cn)
|
||||
print ranks","scheme","bkt","cn","v >> comp
|
||||
}
|
||||
END { printf "%s,%s,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f\n", \
|
||||
ranks, scheme, incl, setup, gpuwait, pack, mpimove, mpiwait, mpimove+mpiwait }
|
||||
' >> "$SUMMARY"
|
||||
done
|
||||
done
|
||||
|
||||
# ---- pretty pivots ----
|
||||
pivot() { # $1=csv $2=value-col-index(1-based, after ranks,key) $3=title
|
||||
awk -F, -v vc="$2" -v title="$3" '
|
||||
NR==1 { next }
|
||||
{ r=$1; k=$2; val=$(2+vc); v[r","k]=val; rs[r]=1; ks[k]=1 }
|
||||
END {
|
||||
n=0; for (k in ks) cols[++n]=k
|
||||
# stable-ish column order
|
||||
order="baseline neighbor persistent rma_pull rma_push P2P NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT"
|
||||
m=split(order,oc," "); delete cols; n=0
|
||||
for (i=1;i<=m;i++) if (oc[i] in ks) cols[++n]=oc[i]
|
||||
printf "\n=== %s ===\n", title
|
||||
printf "%-8s", "ranks"; for (i=1;i<=n;i++) printf "%14s", cols[i]; printf "\n"
|
||||
nr=0; for (r in rs) rr[++nr]=r
|
||||
# numeric sort of ranks
|
||||
for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t}
|
||||
for (i=1;i<=nr;i++){ printf "%-8s", rr[i]; for (c=1;c<=n;c++){key=rr[i]","cols[c]; printf "%14s", (key in v)?v[key]:"-"} printf "\n" }
|
||||
}' "$1"
|
||||
}
|
||||
|
||||
# ---- break-even pivot: iterations after which scheme X beats P2P ----
|
||||
# Model for an iterative solver doing n_it spmv (halo exchanges):
|
||||
# T_X(n) = setup_X + n * steady_X (steady = run.out avg, setup excluded
|
||||
# by the warm-up; setup = run.out "setup time")
|
||||
# X beats the P2P reference when T_X(n) < T_P2P(n). Solving for the crossover:
|
||||
# n* = (setup_X - setup_P2P) / (steady_P2P - steady_X)
|
||||
# Output legend (per cell):
|
||||
# <number> : break-even n* (X wins for n_it > n*) -- more setup, less steady
|
||||
# always : X wins from iteration 1 (cheaper setup AND cheaper steady) -> dominates P2P
|
||||
# never : P2P wins for all n_it (more setup AND slower steady) -> X is dominated
|
||||
# <n* : X wins ONLY for n_it < n* (cheaper setup but slower steady)
|
||||
# ^ this is the GPU/few-ranks RMA case: if RMA steady drops below P2P
|
||||
# you'll instead see a number or "always" here.
|
||||
breakeven() {
|
||||
awk -F, '
|
||||
NR==1 { next }
|
||||
{ r=$1; be=$2; steady[r","be]=$4+0; setp[r","be]=$5+0; rs[r]=1; bes[be]=1 }
|
||||
END {
|
||||
order="NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT"; m=split(order,oc," ")
|
||||
printf "\n=== break-even n* vs P2P (iterations to amortize setup; needs run.out '\''setup time'\'') ===\n"
|
||||
printf "%-8s", "ranks"; for (i=1;i<=m;i++) if (oc[i] in bes) printf "%14s", oc[i]; printf "\n"
|
||||
nr=0; for (r in rs) rr[++nr]=r
|
||||
for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t}
|
||||
anysetup=0
|
||||
for (i=1;i<=nr;i++){
|
||||
r=rr[i]; printf "%-8s", r
|
||||
sref=steady[r",P2P"]; pref=setp[r",P2P"]
|
||||
for (c=1;c<=m;c++){ be=oc[c]; if (!(be in bes)) continue
|
||||
key=r","be
|
||||
if (!(key in steady) || !(r",P2P" in steady)) { printf "%14s","-"; continue }
|
||||
dsteady = sref - steady[key] # >0 : X faster per-iter than P2P
|
||||
dsetup = setp[key] - pref # >0 : X pays more setup than P2P
|
||||
if (setp[key]>0 || pref>0) anysetup=1
|
||||
if (dsteady > 0) printf "%14s", (dsetup<=0 ? "always" : sprintf("%.0f", dsetup/dsteady))
|
||||
else if (dsteady < 0) printf "%14s", (dsetup>=0 ? "never" : sprintf("<%.0f", dsetup/dsteady))
|
||||
else printf "%14s", (dsetup<0 ? "always" : "never")
|
||||
}
|
||||
printf "\n"
|
||||
}
|
||||
if (!anysetup) print "\n NOTE: all setup_s == 0 -> run.out has no \"setup time\" yet (apply the\n warm-up-timing patch to the benchmark). Columns reflect steady-state only."
|
||||
}' "$RUNOUT"
|
||||
}
|
||||
|
||||
echo "Wrote: $RUNOUT $SUMMARY $COMPONENTS"
|
||||
pivot "$RUNOUT" 2 "run.out AVG time per backend [s] (real wall-clock = steady-state per iter)"
|
||||
pivot "$RUNOUT" 3 "run.out SETUP time per backend [s] (one-time, paid in warm-up)"
|
||||
breakeven
|
||||
pivot "$SUMMARY" 6 "scorep MPI WAIT per scheme [s] <== THE discriminator (sync/imbalance; 'guarda le wait')"
|
||||
pivot "$SUMMARY" 7 "scorep NET MPI per scheme [s] (mpimove + mpiwait; excl GPU sync, pack, one-time setup)"
|
||||
pivot "$SUMMARY" 5 "scorep MPI MOVE per scheme [s] (pure transfer: Isend/Irecv/Put/Get/a2av)"
|
||||
pivot "$SUMMARY" 2 "scorep SETUP per scheme [s] (one-time: Win_create/topology_init/alltoallv_init/buf_layout)"
|
||||
pivot "$SUMMARY" 3 "scorep GPU WAIT per scheme [s] (d_cuda_device_wait; sanity: ~equal across schemes)"
|
||||
pivot "$SUMMARY" 1 "scorep INCL per scheme [s] (everything; dominated by GPU wait -> misleading alone)"
|
||||
|
||||
echo
|
||||
echo "Tip: full per-component breakdown (now bucketed) is in $COMPONENTS"
|
||||
echo " (ranks,scheme,bucket,component,time). e.g.:"
|
||||
echo " grep ',rma_pull,' $COMPONENTS | sort -t, -k5 -gr # rma_pull hot spots"
|
||||
echo " awk -F, '\$3==\"mpiwait\"' $COMPONENTS # only the waits"
|
||||
@ -0,0 +1,113 @@
|
||||
#!/bin/bash
|
||||
# ============================================================================
|
||||
# gpu_spmv.sh GPU SpMV across comm backends -- "sparse ranks" study
|
||||
#
|
||||
# Goal: few MPI ranks spread over MANY nodes (1 rank = 1 GPU), so that the
|
||||
# halo exchange is dominated by INTER-node traffic. This is where RMA
|
||||
# (MPI_Get/MPI_Put) may finally pay off: with few, fat messages over the
|
||||
# network the one-time window-creation cost can amortize (break-even n*),
|
||||
# unlike the dense CPU runs where RMA was dominated.
|
||||
#
|
||||
# The spmv test sweeps ALL backends internally (P2P, NEIGHBOR, PNEIGHBOR,
|
||||
# MPI_GET, MPI_PUT) in one run -> no per-mode loop here.
|
||||
# ============================================================================
|
||||
#SBATCH --job-name=spmv_gpu
|
||||
#SBATCH --output=spmv_gpu_%j.out
|
||||
#SBATCH --error=spmv_gpu_%j.err
|
||||
#SBATCH --nodes=8 # reserve up front; we use a subset per point
|
||||
#SBATCH --ntasks-per-node=4 # MN5-ACC has 4x H100 per node
|
||||
#SBATCH --gres=gpu:4 # request all 4 GPUs on each reserved node
|
||||
#SBATCH --cpus-per-task=20 # 80 cores / 4 ranks = 20 cores per rank
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=acc_debug
|
||||
#SBATCH --account=ehpc580
|
||||
|
||||
# ============================================================================
|
||||
# USER CONFIGURATION
|
||||
# ============================================================================
|
||||
TIMES=50 # SpMV repetitions (timed)
|
||||
GPU_FMT=HLG # GPU matrix format (HLG|ELG|CSRG|HDIAG)
|
||||
PROFILE=true # Score-P profiling on/off
|
||||
|
||||
# --- "sparse" knob: how many ranks (= GPUs) to put on each node.
|
||||
# RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm
|
||||
# (this is the configuration that isolates RMA-over-network)
|
||||
# RANKS_PER_NODE=4 -> pack all 4 GPUs/node (denser, more intra-node)
|
||||
RANKS_PER_NODE=1
|
||||
|
||||
# total ranks per scale point. With RANKS_PER_NODE=1 these are also node counts,
|
||||
# so "2 4 8" means 2, 4, 8 nodes each holding a single GPU.
|
||||
RANK_POINTS="2 4 8"
|
||||
|
||||
EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel
|
||||
MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx
|
||||
|
||||
# ============================================================================
|
||||
# ENVIRONMENT
|
||||
# ============================================================================
|
||||
module purge
|
||||
module load bsc/1.0
|
||||
module load nvidia-hpc-sdk/25.3
|
||||
module load gcc/12.3.0
|
||||
module load ucx/1.16.0-gcc
|
||||
module load openmpi/5.0.5-gcc
|
||||
module load openblas/0.3.27-gcc
|
||||
export PATH="$HOME/scorep-cuda/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
|
||||
|
||||
export OMPI_MCA_coll_hcoll_enable=0
|
||||
export SCOREP_CUDA_ENABLE=yes # GPU run: turn the CUDA adapter ON
|
||||
export SCOREP_CUDA_BUFFER=48M # per-stream CUDA record buffer
|
||||
|
||||
RESDIR=$SLURM_SUBMIT_DIR/results_spmv_gpu_${SLURM_JOB_ID}
|
||||
mkdir -p $RESDIR
|
||||
|
||||
# --- Score-P region filter (same as CPU: drop tiny high-frequency USR funcs) ---
|
||||
FILTER=$RESDIR/scorep.filt
|
||||
cat > $FILTER <<'EOF'
|
||||
SCOREP_REGION_NAMES_BEGIN
|
||||
EXCLUDE
|
||||
psb_indx_map_mod::*
|
||||
psb_desc_mod::*
|
||||
psb_error_mod::*
|
||||
psb_gen_block_map_mod::*
|
||||
psi_penv_mod::*
|
||||
psb_hash_mod::*
|
||||
SCOREP_REGION_NAMES_END
|
||||
EOF
|
||||
|
||||
echo "=== SPMV GPU (sparse-ranks study) ==="
|
||||
echo " times=$TIMES gpu_fmt=$GPU_FMT ranks_per_node=$RANKS_PER_NODE"
|
||||
echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE"
|
||||
echo "====================================="
|
||||
|
||||
for NRANKS in $RANK_POINTS; do
|
||||
NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE ))
|
||||
STEP_DIR=$RESDIR/${NRANKS}ranks
|
||||
mkdir -p $STEP_DIR
|
||||
|
||||
if [ "$PROFILE" = "true" ]; then
|
||||
export SCOREP_ENABLE_PROFILING=true
|
||||
export SCOREP_ENABLE_TRACING=false
|
||||
export SCOREP_TOTAL_MEMORY=256M # bumped: CUDA adapter needs more
|
||||
export SCOREP_FILTERING_FILE=$FILTER
|
||||
export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile
|
||||
else
|
||||
export SCOREP_ENABLE_PROFILING=false
|
||||
export SCOREP_ENABLE_TRACING=false
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo ">>> GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)"
|
||||
|
||||
srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \
|
||||
--gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \
|
||||
$EXE --gpu=TRUE --gpu_fmt=$GPU_FMT --matrix=$MATRIX --fmt=MM --times=$TIMES \
|
||||
> $STEP_DIR/run.out 2>&1
|
||||
|
||||
echo ">>> exit=$? output=$STEP_DIR/run.out"
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== SPMV GPU DONE. Results: $RESDIR ==="
|
||||
ls -R $RESDIR
|
||||
@ -0,0 +1,96 @@
|
||||
#!/bin/bash
|
||||
# ============================================================================
|
||||
# strong_spmv.sbatch STRONG scaling, CPU-only SpMV across comm backends
|
||||
#
|
||||
# Fixed total problem size; MPI ranks grow across scale points. The spmv test
|
||||
# sweeps ALL comm backends internally (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET,
|
||||
# MPI_PUT) in a single run, so there is no per-mode loop.
|
||||
# All scale points run inside ONE reserved allocation (same hardware).
|
||||
# ============================================================================
|
||||
#SBATCH --job-name=spmv_strong
|
||||
#SBATCH --output=spmv_strong_%j.out
|
||||
#SBATCH --error=spmv_strong_%j.err
|
||||
#SBATCH --nodes=8 # reserve the maximum up front
|
||||
#SBATCH --ntasks-per-node=80 # 80 cores per ACC node -> 80 ranks/node
|
||||
#SBATCH --cpus-per-task=1 # CPU-only: one core per rank
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=acc_debug
|
||||
#SBATCH --account=ehpc580
|
||||
|
||||
# ============================================================================
|
||||
# USER CONFIGURATION
|
||||
# ============================================================================
|
||||
DIM=280 # FIXED problem size (idim^3 unknowns)
|
||||
TIMES=50 # SpMV repetitions (timed)
|
||||
CPU_FMT=CSR # CPU matrix storage format
|
||||
PROFILE=true # Score-P profiling on/off
|
||||
RANK_POINTS="80 160 320 640" # total ranks per scale point (multiples of 80)
|
||||
|
||||
EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel
|
||||
|
||||
# ============================================================================
|
||||
# ENVIRONMENT
|
||||
# ============================================================================
|
||||
module purge
|
||||
module load bsc/1.0
|
||||
module load nvidia-hpc-sdk/25.3
|
||||
module load gcc/12.3.0
|
||||
module load ucx/1.16.0-gcc
|
||||
module load openmpi/5.0.5-gcc
|
||||
module load openblas/0.3.27-gcc
|
||||
export PATH="$HOME/scorep-cuda/bin:$PATH"
|
||||
export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
|
||||
|
||||
export OMPI_MCA_coll_hcoll_enable=0
|
||||
export SCOREP_CUDA_ENABLE=no # CPU-only run: silence Score-P CUDA adapter
|
||||
|
||||
RESDIR=$SLURM_SUBMIT_DIR/results_spmv_strong_${SLURM_JOB_ID}
|
||||
mkdir -p $RESDIR
|
||||
|
||||
# --- Score-P region filter: exclude the high-frequency tiny USR functions so
|
||||
# the profile reflects real MPI/comm time, not instrumentation overhead. ---
|
||||
FILTER=$RESDIR/scorep.filt
|
||||
cat > $FILTER <<'EOF'
|
||||
SCOREP_REGION_NAMES_BEGIN
|
||||
EXCLUDE
|
||||
psb_indx_map_mod::*
|
||||
psb_desc_mod::*
|
||||
psb_error_mod::*
|
||||
psb_gen_block_map_mod::*
|
||||
psi_penv_mod::*
|
||||
psb_hash_mod::*
|
||||
SCOREP_REGION_NAMES_END
|
||||
EOF
|
||||
|
||||
echo "=== SPMV STRONG scaling (CPU-only) ==="
|
||||
echo " fixed_dim=$DIM times=$TIMES cpu_fmt=$CPU_FMT"
|
||||
echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE"
|
||||
echo "======================================"
|
||||
|
||||
for NRANKS in $RANK_POINTS; do
|
||||
NNODES=$(( (NRANKS + 79) / 80 )) # 80 ranks per node
|
||||
STEP_DIR=$RESDIR/${NRANKS}ranks
|
||||
mkdir -p $STEP_DIR
|
||||
|
||||
if [ "$PROFILE" = "true" ]; then
|
||||
export SCOREP_ENABLE_PROFILING=true
|
||||
export SCOREP_ENABLE_TRACING=false
|
||||
export SCOREP_TOTAL_MEMORY=128M
|
||||
export SCOREP_FILTERING_FILE=$FILTER
|
||||
export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile
|
||||
else
|
||||
export SCOREP_ENABLE_PROFILING=false
|
||||
export SCOREP_ENABLE_TRACING=false
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo ">>> STRONG point: $NRANKS ranks ($NNODES nodes), fixed dim=$DIM"
|
||||
srun -N $NNODES -n $NRANKS --ntasks-per-node=80 --cpus-per-task=1 \
|
||||
$EXE --gpu=FALSE --dim=$DIM --times=$TIMES --cpu_fmt=$CPU_FMT \
|
||||
> $STEP_DIR/run.out 2>&1
|
||||
echo ">>> exit=$? output=$STEP_DIR/run.out"
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== SPMV STRONG DONE. Results: $RESDIR ==="
|
||||
ls -R $RESDIR
|
||||
@ -0,0 +1,398 @@
|
||||
!
|
||||
! Parallel Sparse BLAS version 3.5
|
||||
! (C) Copyright 2006-2018
|
||||
! Salvatore Filippone
|
||||
! Alfredo Buttari
|
||||
!
|
||||
! File: psb_d_nest_halo_regime_test.F90
|
||||
! Author: Simone Staccone (Stack-1)
|
||||
!
|
||||
! Benchmarks the two halo-exchange regimes of a nested (MATNEST-style) operator
|
||||
! on the SAME assembled blocks:
|
||||
!
|
||||
! * UNION (fused): one psb_halo over the composed global descriptor brings in
|
||||
! the union of every field's ghosts at once (the production path used by
|
||||
! psb_spmm / Krylov / AMG4PSBLAS).
|
||||
!
|
||||
! * PER-BLOCK (selective): each present block (i,j) exchanges ONLY its own
|
||||
! off-process columns, through its restricted descriptor block_col_desc(i,j),
|
||||
! one psb_halo per block.
|
||||
!
|
||||
! The two products must coincide to machine precision. The test reports, for
|
||||
! each regime: the number of halo exchanges, the full-matvec time, and the
|
||||
! PURE-COMMUNICATION time (only the psb_halo calls), so the cost of the message
|
||||
! aggregation can be isolated.
|
||||
!
|
||||
! Saddle-point 2x2 operator [ A B^T ; B 0 ] with TWO fields of DIFFERENT
|
||||
! size (n1 = field 1 / "velocity", n2 = field 2 / "pressure", n1 /= n2 allowed);
|
||||
! the (2,2) block is absent. The blocks have REAL, distinct per-block halos (so
|
||||
! per-block vs union is meaningful): A (1,1) tridiagonal on field 1; the
|
||||
! rectangular B^T (1,2) and B (2,1) couple every row to a column half a field
|
||||
! away, which lands on another process.
|
||||
!
|
||||
! Run: mpirun -np <P> ./psb_d_nest_halo_regime_test [n1] [n2] [n_reps] [stride]
|
||||
! n1 : global rows of field 1 (default 2000000)
|
||||
! n2 : global rows of field 2 (default 500000)
|
||||
! n_reps : timed repetitions (default 50)
|
||||
! stride : B/B^T coupling distance (default 1)
|
||||
! small => local halo => LATENCY-bound (aggregation wins)
|
||||
! ~n/2 => huge halo => BANDWIDTH-bound (union ~ per-block)
|
||||
!
|
||||
program psb_d_nest_halo_regime_test
|
||||
use psb_base_mod
|
||||
use psb_d_nest_mod
|
||||
implicit none
|
||||
|
||||
type(psb_ctxt_type) :: context
|
||||
integer(psb_ipk_) :: my_rank, num_procs, info, i_local_row
|
||||
integer(psb_ipk_) :: entry_idx, field1_local_rows, field2_local_rows
|
||||
integer(psb_ipk_) :: n_local_rows, n_local_cols, n_exch, n_warm
|
||||
integer(psb_ipk_) :: n_reps, narg
|
||||
integer(psb_lpk_) :: global_row, field1_size, field2_size, gcol, stride
|
||||
character(len=64) :: arg
|
||||
|
||||
type(psb_d_nest_matrix) :: nested_matrix
|
||||
integer(psb_lpk_), allocatable :: entry_rows(:), entry_cols(:)
|
||||
integer(psb_lpk_), allocatable :: field1_rows(:), field2_rows(:)
|
||||
real(psb_dpk_), allocatable :: entry_vals(:)
|
||||
real(psb_dpk_), allocatable :: x_arr(:), work_x(:), y_union(:), y_perblock(:)
|
||||
real(psb_dpk_) :: mismatch_norm, t_u, t_b
|
||||
logical :: ok_u, ok_b
|
||||
integer(psb_ipk_) :: s_idx, n_schemes
|
||||
integer(psb_ipk_), allocatable :: schemes(:)
|
||||
character(len=34), allocatable :: scheme_names(:)
|
||||
type(psb_d_vect_type) :: xg_vect ! global-local halo vector
|
||||
type(psb_d_vect_type), allocatable :: bvect(:,:) ! one per present block
|
||||
real(psb_dpk_), parameter :: tolerance = 1.0e-10_psb_dpk_
|
||||
|
||||
call psb_init(context)
|
||||
call psb_info(context, my_rank, num_procs)
|
||||
|
||||
! -------- runtime parameters (rank 0 parses, then broadcast) --------
|
||||
field1_size = 2000000_psb_lpk_
|
||||
field2_size = 500000_psb_lpk_
|
||||
n_reps = 50
|
||||
stride = 1_psb_lpk_ ! B/B^T coupling distance: small => local halo
|
||||
! (latency-bound), large (~n/2) => big halo
|
||||
! (bandwidth-bound)
|
||||
if (my_rank == 0) then
|
||||
narg = command_argument_count()
|
||||
if (narg >= 1) then
|
||||
call get_command_argument(1, arg); read(arg,*) field1_size
|
||||
end if
|
||||
if (narg >= 2) then
|
||||
call get_command_argument(2, arg); read(arg,*) field2_size
|
||||
end if
|
||||
if (narg >= 3) then
|
||||
call get_command_argument(3, arg); read(arg,*) n_reps
|
||||
end if
|
||||
if (narg >= 4) then
|
||||
call get_command_argument(4, arg); read(arg,*) stride
|
||||
end if
|
||||
end if
|
||||
call psb_bcast(context, field1_size)
|
||||
call psb_bcast(context, field2_size)
|
||||
call psb_bcast(context, n_reps)
|
||||
call psb_bcast(context, stride)
|
||||
n_warm = max(5, n_reps/10)
|
||||
|
||||
!---------------------------------------------------------------
|
||||
! 1) build the 2x2 nested operator with real per-block halos
|
||||
!---------------------------------------------------------------
|
||||
call nested_matrix%init(context, [field1_size, field2_size], info)
|
||||
if (info /= psb_success_) then
|
||||
if (my_rank==0) write(*,*) 'FAIL: init info=', info; goto 9999
|
||||
end if
|
||||
field1_rows = nested_matrix%get_owned_rows(1)
|
||||
field2_rows = nested_matrix%get_owned_rows(2)
|
||||
field1_local_rows = size(field1_rows)
|
||||
field2_local_rows = size(field2_rows)
|
||||
|
||||
! A = tridiag(-1,2,-1) -> block (1,1)
|
||||
allocate(entry_rows(3*field1_local_rows), entry_cols(3*field1_local_rows), entry_vals(3*field1_local_rows))
|
||||
entry_idx = 0
|
||||
do i_local_row = 1, field1_local_rows
|
||||
global_row = field1_rows(i_local_row)
|
||||
entry_idx = entry_idx + 1
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row; entry_vals(entry_idx) = 2.0_psb_dpk_
|
||||
if (global_row > 1) then
|
||||
entry_idx = entry_idx + 1
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row - 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_
|
||||
end if
|
||||
if (global_row < field1_size) then
|
||||
entry_idx = entry_idx + 1
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row + 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_
|
||||
end if
|
||||
end do
|
||||
call nested_matrix%ins(1, 1, entry_idx, entry_rows, entry_cols, entry_vals, info)
|
||||
deallocate(entry_rows, entry_cols, entry_vals)
|
||||
|
||||
! B^T -> block (1,2): rows in field 1, columns in field 2 (RECTANGULAR n1 x n2).
|
||||
! Each row r couples to field-2 columns (r mod n2) and (r + n2/2 mod n2); the
|
||||
! second one lands on another process => real, distinct halo for this block.
|
||||
allocate(entry_rows(2*field1_local_rows), entry_cols(2*field1_local_rows), entry_vals(2*field1_local_rows))
|
||||
entry_idx = 0
|
||||
do i_local_row = 1, field1_local_rows
|
||||
global_row = field1_rows(i_local_row)
|
||||
entry_idx = entry_idx + 1
|
||||
gcol = mod(global_row - 1_psb_lpk_, field2_size) + 1_psb_lpk_
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.5_psb_dpk_
|
||||
entry_idx = entry_idx + 1
|
||||
gcol = mod(global_row - 1_psb_lpk_ + stride, field2_size) + 1_psb_lpk_
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.25_psb_dpk_
|
||||
end do
|
||||
call nested_matrix%ins(1, 2, entry_idx, entry_rows, entry_cols, entry_vals, info)
|
||||
deallocate(entry_rows, entry_cols, entry_vals)
|
||||
|
||||
! B -> block (2,1): rows in field 2, columns in field 1 (RECTANGULAR n2 x n1).
|
||||
allocate(entry_rows(2*field2_local_rows), entry_cols(2*field2_local_rows), entry_vals(2*field2_local_rows))
|
||||
entry_idx = 0
|
||||
do i_local_row = 1, field2_local_rows
|
||||
global_row = field2_rows(i_local_row)
|
||||
entry_idx = entry_idx + 1
|
||||
gcol = mod(global_row - 1_psb_lpk_, field1_size) + 1_psb_lpk_
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.3_psb_dpk_
|
||||
entry_idx = entry_idx + 1
|
||||
gcol = mod(global_row - 1_psb_lpk_ + stride, field1_size) + 1_psb_lpk_
|
||||
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.15_psb_dpk_
|
||||
end do
|
||||
call nested_matrix%ins(2, 1, entry_idx, entry_rows, entry_cols, entry_vals, info)
|
||||
deallocate(entry_rows, entry_cols, entry_vals)
|
||||
|
||||
call nested_matrix%asb(info)
|
||||
if (info /= psb_success_) then
|
||||
if (my_rank==0) write(*,*) 'FAIL: asb info=', info; goto 9999
|
||||
end if
|
||||
|
||||
!---------------------------------------------------------------
|
||||
! 2) global-local work vectors (x[g] = g on the owned entries)
|
||||
!---------------------------------------------------------------
|
||||
n_local_rows = nested_matrix%desc_glob%get_local_rows()
|
||||
n_local_cols = nested_matrix%desc_glob%get_local_cols()
|
||||
allocate(x_arr(n_local_cols), work_x(n_local_cols), &
|
||||
& y_union(n_local_cols), y_perblock(n_local_cols))
|
||||
x_arr = dzero
|
||||
do i_local_row = 1, n_local_rows
|
||||
call nested_matrix%desc_glob%l2g(i_local_row, global_row, info)
|
||||
x_arr(i_local_row) = real(global_row, psb_dpk_)
|
||||
end do
|
||||
|
||||
!---------------------------------------------------------------
|
||||
! 3) correctness: union vs per-block on the same x
|
||||
!---------------------------------------------------------------
|
||||
work_x = x_arr
|
||||
call psb_spmm(done, nested_matrix%a_glob, work_x, dzero, y_union, nested_matrix%desc_glob, info)
|
||||
if (info /= psb_success_) then
|
||||
if (my_rank==0) write(*,*) 'FAIL: psb_spmm (union) info=', info; goto 9999
|
||||
end if
|
||||
call nest_spmv_perblock(nested_matrix, done, x_arr, dzero, y_perblock, info, n_exch, .false.)
|
||||
if (info /= psb_success_) then
|
||||
if (my_rank==0) write(*,*) 'FAIL: per-block driver info=', info; goto 9999
|
||||
end if
|
||||
mismatch_norm = dzero
|
||||
do i_local_row = 1, n_local_rows
|
||||
mismatch_norm = max(mismatch_norm, abs(y_union(i_local_row) - y_perblock(i_local_row)))
|
||||
end do
|
||||
call psb_amx(context, mismatch_norm)
|
||||
|
||||
!---------------------------------------------------------------
|
||||
! 4) per-scheme PURE COMMUNICATION sweep.
|
||||
! The comm scheme is selected per descriptor (desc%set_comm_scheme) and is
|
||||
! honoured ONLY by the encapsulated vect path; the array path used in (3)
|
||||
! is always baseline. For each scheme we time, on persistent vects:
|
||||
! union = one psb_halo over desc_glob
|
||||
! per-block = one psb_halo per present block_col_desc(i,j)
|
||||
!---------------------------------------------------------------
|
||||
n_schemes = 5
|
||||
allocate(schemes(n_schemes), scheme_names(n_schemes))
|
||||
schemes = [ psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_, &
|
||||
& psb_comm_persistent_ineighbor_alltoallv_, &
|
||||
& psb_comm_rma_pull_, psb_comm_rma_push_ ]
|
||||
scheme_names = [ character(len=34) :: 'isend_irecv (baseline)', &
|
||||
& 'ineighbor_alltoallv', 'persistent_ineighbor_alltoallv', &
|
||||
& 'rma_pull', 'rma_push' ]
|
||||
allocate(bvect(nested_matrix%n_fields, nested_matrix%n_fields))
|
||||
|
||||
if (my_rank == 0) then
|
||||
write(*,'(a,i0,a,i0,a,i0,a,i0,a,i0)') ' np=', num_procs, ' n1=', field1_size, &
|
||||
& ' n2=', field2_size, ' stride=', stride, ' reps=', n_reps
|
||||
write(*,'(a,es12.4)') ' max|y_union - y_perblock| = ', mismatch_norm
|
||||
if (mismatch_norm <= tolerance) then
|
||||
write(*,*) ' PASS: regimes agree'
|
||||
else
|
||||
write(*,*) ' FAIL: regimes disagree'
|
||||
end if
|
||||
write(*,'(a,i0)') ' halo exchanges: union = 1 per-block = ', n_exch
|
||||
write(*,*)
|
||||
write(*,'(a)') ' pure halo communication time [s] (min over reps, slowest rank)'
|
||||
write(*,'(a)') ' scheme union per-block ratio'
|
||||
end if
|
||||
|
||||
do s_idx = 1, n_schemes
|
||||
call set_scheme(schemes(s_idx))
|
||||
call build_vects()
|
||||
call time_comm('U', t_u, ok_u)
|
||||
call time_comm('B', t_b, ok_b)
|
||||
call free_vects()
|
||||
if (my_rank == 0) then
|
||||
if (ok_u .and. ok_b) then
|
||||
write(*,'(1x,a34,es15.4,es15.4,3x,f7.2)') scheme_names(s_idx), t_u, t_b, &
|
||||
& t_b/max(t_u, tiny(t_u))
|
||||
else
|
||||
write(*,'(1x,a34,a)') scheme_names(s_idx), ' (unavailable on this build/MPI)'
|
||||
end if
|
||||
end if
|
||||
end do
|
||||
|
||||
deallocate(bvect, schemes, scheme_names)
|
||||
call nested_matrix%free(info)
|
||||
9999 continue
|
||||
call psb_exit(context)
|
||||
|
||||
contains
|
||||
|
||||
! Set the communication scheme on the global descriptor and on every present
|
||||
! per-block descriptor (the per-block exchanges must use the same scheme).
|
||||
subroutine set_scheme(scheme)
|
||||
integer(psb_ipk_), intent(in) :: scheme
|
||||
integer(psb_ipk_) :: i, j, linfo
|
||||
call nested_matrix%desc_glob%set_comm_scheme(scheme, linfo)
|
||||
do j = 1, nested_matrix%n_fields
|
||||
do i = 1, nested_matrix%n_fields
|
||||
if (nested_matrix%block_storage%has_block(i,j)) &
|
||||
& call nested_matrix%block_col_desc(i,j)%set_comm_scheme(scheme, linfo)
|
||||
end do
|
||||
end do
|
||||
end subroutine set_scheme
|
||||
|
||||
! Fresh persistent halo vectors (the comm_handle, hence the scheme, is created
|
||||
! from desc%comm_type on the first psb_halo and then reused across reps).
|
||||
subroutine build_vects()
|
||||
integer(psb_ipk_) :: i, j, linfo
|
||||
call psb_geall(xg_vect, nested_matrix%desc_glob, linfo)
|
||||
call psb_geasb(xg_vect, nested_matrix%desc_glob, linfo)
|
||||
do j = 1, nested_matrix%n_fields
|
||||
do i = 1, nested_matrix%n_fields
|
||||
if (.not. nested_matrix%block_storage%has_block(i,j)) cycle
|
||||
call psb_geall(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
|
||||
call psb_geasb(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
|
||||
end do
|
||||
end do
|
||||
end subroutine build_vects
|
||||
|
||||
subroutine free_vects()
|
||||
integer(psb_ipk_) :: i, j, linfo
|
||||
call xg_vect%free(linfo)
|
||||
do j = 1, nested_matrix%n_fields
|
||||
do i = 1, nested_matrix%n_fields
|
||||
if (nested_matrix%block_storage%has_block(i,j)) call bvect(i,j)%free(linfo)
|
||||
end do
|
||||
end do
|
||||
end subroutine free_vects
|
||||
|
||||
! Time the pure communication of one regime: n_warm warm-up runs, then min over
|
||||
! n_reps of the slowest rank's wall time (psb_amx = max across ranks).
|
||||
subroutine time_comm(code, t_min, ok)
|
||||
character, intent(in) :: code ! 'U' union / 'B' per-block
|
||||
real(psb_dpk_), intent(out) :: t_min
|
||||
logical, intent(out) :: ok
|
||||
integer(psb_ipk_) :: rep, linfo
|
||||
real(psb_dpk_) :: t0, dt
|
||||
ok = .true.
|
||||
do rep = 1, n_warm
|
||||
call do_comm(code, linfo); if (linfo /= 0) ok = .false.
|
||||
end do
|
||||
t_min = huge(t_min)
|
||||
do rep = 1, n_reps
|
||||
call psb_barrier(context)
|
||||
t0 = psb_wtime()
|
||||
call do_comm(code, linfo)
|
||||
dt = psb_wtime() - t0
|
||||
call psb_amx(context, dt)
|
||||
t_min = min(t_min, dt)
|
||||
if (linfo /= 0) ok = .false.
|
||||
end do
|
||||
if (.not. ok) t_min = dzero
|
||||
end subroutine time_comm
|
||||
|
||||
subroutine do_comm(code, linfo)
|
||||
character, intent(in) :: code
|
||||
integer(psb_ipk_), intent(out) :: linfo
|
||||
integer(psb_ipk_) :: i, j
|
||||
linfo = 0
|
||||
if (code == 'U') then
|
||||
call psb_halo(xg_vect, nested_matrix%desc_glob, linfo)
|
||||
else
|
||||
do j = 1, nested_matrix%n_fields
|
||||
do i = 1, nested_matrix%n_fields
|
||||
if (.not. nested_matrix%block_storage%has_block(i,j)) cycle
|
||||
call psb_halo(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
|
||||
if (linfo /= 0) return
|
||||
end do
|
||||
end do
|
||||
end if
|
||||
end subroutine do_comm
|
||||
|
||||
! Per-block nested matvec, exchanging one block's halo at a time. x, y are in
|
||||
! the composed global-local layout. halo_only=.true. performs ONLY the
|
||||
! per-block psb_halo calls (for the pure-communication timing).
|
||||
subroutine nest_spmv_perblock(op, alpha, x, beta, y, info, n_exchanges, halo_only)
|
||||
type(psb_d_nest_matrix), intent(inout) :: op
|
||||
real(psb_dpk_), intent(in) :: x(:)
|
||||
real(psb_dpk_), intent(inout) :: y(:)
|
||||
real(psb_dpk_), intent(in) :: alpha, beta
|
||||
integer(psb_ipk_), intent(out) :: info, n_exchanges
|
||||
logical, intent(in) :: halo_only
|
||||
|
||||
integer(psb_ipk_) :: nf, i, j, k, nrl, nownj, nowni, nfc, nbc
|
||||
integer(psb_ipk_), allocatable :: owned_offset(:)
|
||||
real(psb_dpk_), allocatable :: xs(:), xf(:), yb(:)
|
||||
|
||||
info = psb_success_; n_exchanges = 0
|
||||
nf = op%n_fields
|
||||
allocate(owned_offset(nf+1))
|
||||
owned_offset(1) = 0
|
||||
do j = 1, nf
|
||||
owned_offset(j+1) = owned_offset(j) + op%field_desc(j)%get_local_rows()
|
||||
end do
|
||||
nrl = owned_offset(nf+1)
|
||||
|
||||
if (.not. halo_only) then
|
||||
if (beta == dzero) then
|
||||
y(1:nrl) = dzero
|
||||
else if (beta /= done) then
|
||||
y(1:nrl) = beta * y(1:nrl)
|
||||
end if
|
||||
end if
|
||||
|
||||
do j = 1, nf
|
||||
nownj = op%field_desc(j)%get_local_rows()
|
||||
nfc = op%field_desc(j)%get_local_cols()
|
||||
do i = 1, nf
|
||||
if (.not. op%block_storage%has_block(i,j)) cycle
|
||||
nbc = op%block_col_desc(i,j)%get_local_cols()
|
||||
allocate(xs(nbc)); xs = dzero
|
||||
xs(1:nownj) = x(owned_offset(j)+1 : owned_offset(j)+nownj)
|
||||
call psb_halo(xs, op%block_col_desc(i,j), info)
|
||||
if (info /= psb_success_) return
|
||||
n_exchanges = n_exchanges + 1
|
||||
if (.not. halo_only) then
|
||||
allocate(xf(nfc)); xf = dzero
|
||||
do k = 1, nbc
|
||||
xf(op%blk2field(i,j)%map(k)) = xs(k)
|
||||
end do
|
||||
nowni = op%field_desc(i)%get_local_rows()
|
||||
allocate(yb(nowni)); yb = dzero
|
||||
call op%block_storage%mats(i,j)%a%csmv(alpha, xf, dzero, yb, info)
|
||||
if (info /= psb_success_) return
|
||||
y(owned_offset(i)+1 : owned_offset(i)+nowni) = &
|
||||
& y(owned_offset(i)+1 : owned_offset(i)+nowni) + yb
|
||||
deallocate(xf, yb)
|
||||
end if
|
||||
deallocate(xs)
|
||||
end do
|
||||
end do
|
||||
deallocate(owned_offset)
|
||||
end subroutine nest_spmv_perblock
|
||||
|
||||
end program psb_d_nest_halo_regime_test
|
||||
Loading…
Reference in New Issue