[UPDATE] Updated communicationtest to read also external matrices

communication_v2
Stack-1 3 weeks ago
parent be6f27069a
commit 52ad95461d

1
.gitignore vendored

@ -28,4 +28,5 @@ autom4te.cache
# the executable from tests
runs
data

@ -34,4 +34,12 @@ From this directory:
- `make run` (defaults: `NP=4`, `IDIM=40`)
- `make run NP=8 IDIM=80`
The program accepts one optional CLI argument: `IDIM`.
Default PDE-generated matrix:
- `./runs/psb_comm_cg_test [idim] [nrep] [nwarm] [itmax] [--gpu=TRUE|FALSE]`
External matrix input:
- `./runs/psb_comm_cg_test [idim] [nrep] [nwarm] [itmax] --matrix=<path> [--fmt=MM|HB] [--gpu=TRUE|FALSE]`
When `--matrix` is provided, the test reads and distributes that matrix and ignores the PDE generator.

@ -1,5 +1,6 @@
program psb_comm_cg_test
use psb_base_mod
use psb_util_mod
#ifdef PSB_HAVE_CUDA
use psb_cuda_mod
#endif
@ -44,9 +45,12 @@ program psb_comm_cg_test
character(len=20) :: prec_name(n_precs)
character(len=5) :: afmt
character(len=256) :: arg
character(len=256) :: matrix_file
character(len=2) :: matrix_fmt
character(len=16) :: gpu_arg
logical :: setup_done
logical :: use_gpu
logical :: use_external_matrix
info = psb_success_
afmt = 'CSR'
@ -58,6 +62,9 @@ program psb_comm_cg_test
itrace = -1
istop = 2
eps = 1.d-6
matrix_file = ''
matrix_fmt = 'MM'
use_external_matrix = .false.
#ifdef PSB_HAVE_CUDA
use_gpu = .true.
#else
@ -111,6 +118,12 @@ program psb_comm_cg_test
write(psb_err_unit,'("Invalid value for --gpu option. Use --gpu=TRUE or --gpu=FALSE")')
stop 1
end if
call parse_matrix_arg(matrix_file, matrix_fmt, info)
if (info /= psb_success_) then
write(psb_err_unit,'("Invalid matrix options. Use --matrix=<path> [--fmt=MM|HB]")')
stop 1
end if
use_external_matrix = (len_trim(matrix_file) > 0)
! call psb_set_debug_level(psb_debug_ext_)
@ -142,7 +155,12 @@ program psb_comm_cg_test
if (my_rank == psb_root_) then
write(psb_out_unit,*) 'Welcome to PSBLAS version: ', psb_version_string_
write(psb_out_unit,*) 'This is the comm/cg test program'
write(psb_out_unit,'("Grid dimensions : ",i4," x ",i4," x ",i4)') idim,idim,idim
if (use_external_matrix) then
write(psb_out_unit,'("Input matrix : ",a)') trim(matrix_file)
write(psb_out_unit,'("Input format : ",a)') trim(matrix_fmt)
else
write(psb_out_unit,'("Grid dimensions : ",i4," x ",i4," x ",i4)') idim,idim,idim
end if
write(psb_out_unit,'("Number of processors : ",i0)') np
write(psb_out_unit,'("Iterative method : CG")')
write(psb_out_unit,'("Preconditioners : NONE, DIAG")')
@ -151,14 +169,19 @@ program psb_comm_cg_test
write(psb_out_unit,'("Warmup solves : ",i0)') nwarm
write(psb_out_unit,'("GPU enabled : ",l1)') use_gpu
write(psb_out_unit,'(" ")')
write(psb_out_unit,'("Usage: ./psb_comm_cg_test [idim] [nrep] [nwarm] [itmax] [--gpu=TRUE|FALSE]")')
write(psb_out_unit,'("Usage: ./psb_comm_cg_test [idim] [nrep] [nwarm] [itmax] ",&
&"[--gpu=TRUE|FALSE] [--matrix=<path>] [--fmt=MM|HB]")')
write(psb_out_unit,'(" ")')
end if
call psb_barrier(ctxt)
! call probe_ieee('before psb_d_gen_pde3d')
call psb_d_gen_pde3d(ctxt,idim,a,b,x,desc_a,afmt,info)
! call probe_ieee('after psb_d_gen_pde3d')
if (use_external_matrix) then
call load_external_matrix(ctxt, matrix_file, matrix_fmt, a, b, x, desc_a, afmt, info)
else
! call probe_ieee('before psb_d_gen_pde3d')
call psb_d_gen_pde3d(ctxt,idim,a,b,x,desc_a,afmt,info)
! call probe_ieee('after psb_d_gen_pde3d')
end if
if (info /= psb_success_) goto 9999
#ifdef PSB_HAVE_CUDA
@ -168,9 +191,9 @@ program psb_comm_cg_test
if (info /= psb_success_) goto 9999
call desc_a%cnv(mold=imold)
if (info /= psb_success_) goto 9999
call psb_geasb(x,desc_a,info,mold=vmold)
call x%cnv(mold=vmold)
if (info /= psb_success_) goto 9999
call psb_geasb(b,desc_a,info,mold=vmold)
call b%cnv(mold=vmold)
if (info /= psb_success_) goto 9999
end if
#endif
@ -514,6 +537,113 @@ contains
end do
end subroutine parse_gpu_arg
subroutine parse_matrix_arg(matrix_file, matrix_fmt, info)
character(len=*), intent(inout) :: matrix_file
character(len=*), intent(inout) :: matrix_fmt
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, argc
character(len=256) :: carg, uarg, val
info = psb_success_
argc = command_argument_count()
do i = 1, argc
call get_command_argument(i,carg)
uarg = psb_toupper(trim(carg))
if (index(uarg,'--MATRIX=') == 1) then
matrix_file = adjustl(carg(10:len_trim(carg)))
else if (trim(uarg) == '--MATRIX') then
if (i < argc) then
call get_command_argument(i+1,matrix_file)
else
info = psb_err_internal_error_
return
end if
else if (index(uarg,'--FMT=') == 1) then
val = psb_toupper(adjustl(carg(7:len_trim(carg))))
if ((trim(val) == 'MM') .or. (trim(val) == 'HB')) then
matrix_fmt = trim(val)
else
info = psb_err_internal_error_
return
end if
else if (trim(uarg) == '--FMT') then
if (i < argc) then
call get_command_argument(i+1,val)
val = psb_toupper(trim(val))
if ((trim(val) == 'MM') .or. (trim(val) == 'HB')) then
matrix_fmt = trim(val)
else
info = psb_err_internal_error_
return
end if
else
info = psb_err_internal_error_
return
end if
end if
end do
end subroutine parse_matrix_arg
subroutine load_external_matrix(ctxt, matrix_file, matrix_fmt, a, bv, xv, desc_a, afmt, info)
type(psb_ctxt_type), intent(in) :: ctxt
character(len=*), intent(in) :: matrix_file
character(len=*), intent(in) :: matrix_fmt
type(psb_dspmat_type), intent(out) :: a
type(psb_d_vect_type), intent(out) :: bv, xv
type(psb_desc_type), intent(out) :: desc_a
character(len=*), intent(in) :: afmt
integer(psb_ipk_), intent(out) :: info
type(psb_ldspmat_type) :: aux_a
real(psb_dpk_), allocatable :: rhs_glob(:), x_glob(:)
integer(psb_lpk_) :: nrows, ncols
info = psb_success_
select case(psb_toupper(trim(matrix_fmt)))
case('MM')
call mm_mat_read(aux_a,info,filename=trim(matrix_file))
case('HB')
call hb_read(aux_a,info,filename=trim(matrix_file))
case default
info = psb_err_internal_error_
return
end select
if (info /= psb_success_) return
nrows = aux_a%get_nrows()
ncols = aux_a%get_ncols()
if (nrows /= ncols) then
write(psb_err_unit,'("Input matrix must be square for CG: ",a)') trim(matrix_file)
info = psb_err_internal_error_
return
end if
call psb_matdist(aux_a, a, ctxt, desc_a, info, fmt=afmt, parts=part_block)
if (info /= psb_success_) return
call psb_geall(xv,desc_a,info)
if (info /= psb_success_) return
call psb_geall(bv,desc_a,info)
if (info /= psb_success_) return
allocate(rhs_glob(nrows), x_glob(ncols), stat=info)
if (info /= psb_success_) then
info = psb_err_alloc_dealloc_
return
end if
rhs_glob = done
x_glob = dzero
call psb_scatter(rhs_glob,bv,desc_a,info,root=psb_root_)
if (info /= psb_success_) return
call psb_scatter(x_glob,xv,desc_a,info,root=psb_root_)
if (info /= psb_success_) return
deallocate(rhs_glob, x_glob)
end subroutine load_external_matrix
subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info)
implicit none
integer(psb_ipk_), intent(in) :: idim

@ -19,3 +19,13 @@ Communication schemes compared:
Unlike `swapdata/`, which checks direct halo exchange, this test covers the
overlapped SpMV workflow.
Run options
-----------
- Default PDE-generated matrix: `./runs/psb_spmv_kernel [--gpu=TRUE|FALSE] [--nooverlap]`
- External matrix: `./runs/psb_spmv_kernel [--gpu=TRUE|FALSE] --matrix=<path> [--fmt=MM|HB] [--nooverlap]`
The overlap path is enabled by default; pass `--nooverlap` to force the non-overlapped halo-update path.
When `--matrix` is provided, the benchmark reads and distributes that matrix instead of generating the 3D PDE test matrix.

@ -1,8 +1,5 @@
!> Test program for overlapping communication and computation with psb_spmm.
!!
!! This benchmark compares two equivalent SpMV paths:
!! 1. Serialized halo exchange + compute
!! 2. Overlapped psb_spmm(..., doswap=.true.)
!!
module psb_spmv_overlap_test
@ -532,7 +529,7 @@ contains
return
end subroutine psb_d_gen_pde3d
subroutine run_spmv_kernel(ctxt,use_gpu)
subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,do_swap)
use psb_base_mod
#ifdef PSB_HAVE_CUDA
use psb_cuda_mod
@ -541,73 +538,112 @@ contains
type(psb_ctxt_type), intent(in) :: ctxt
logical, intent(in) :: use_gpu
character(len=*), intent(in) :: matrix_file
character(len=*), intent(in) :: matrix_fmt
character(len=*), intent(in) :: cpu_fmt
character(len=*), intent(in) :: gpu_fmt
integer(psb_ipk_), intent(in) :: idim_in, times_in
logical, intent(in) :: do_swap
type(psb_dspmat_type) :: a
type(psb_d_vect_type) :: x, y
type(psb_desc_type) :: desc_a
character(len=5) :: afmt
character(len=8) :: afmt
character(len=64) :: env_buf
integer(psb_ipk_) :: my_rank, np, info, err_act
integer(psb_ipk_) :: idim, times, i, n_global
integer :: env_len, env_status, ios
real(psb_dpk_) :: alpha, beta, t0, t1, dt, avg_t
logical :: use_external_matrix
#ifdef PSB_HAVE_CUDA
type(psb_d_vect_cuda) :: vmold
type(psb_i_vect_cuda) :: imold
type(psb_d_cuda_hlg_sparse_mat), target :: ahlg
class(psb_d_base_sparse_mat), pointer :: agmold
type(psb_d_vect_cuda) :: cuda_vector_mold
type(psb_i_vect_cuda) :: cuda_index_mold
type(psb_d_cuda_elg_sparse_mat), target :: cuda_ell_sparse_mold
type(psb_d_cuda_csrg_sparse_mat), target :: cuda_csr_sparse_mold
type(psb_d_cuda_hdiag_sparse_mat), target :: cuda_hdia_sparse_mold
type(psb_d_cuda_hlg_sparse_mat), target :: cuda_hll_sparse_mold
class(psb_d_base_sparse_mat), pointer :: cuda_sparse_mold
#endif
info = psb_success_
afmt = 'CSR'
idim = 10
times = 100
afmt = psb_toupper(trim(cpu_fmt))
if (len_trim(afmt) == 0) afmt = 'CSR'
if (idim_in > 0) then
idim = idim_in
else
idim = 10
end if
if (times_in > 0) then
times = times_in
else
times = 100
end if
alpha = done
beta = dzero
call psb_erractionsave(err_act)
call psb_info(ctxt, my_rank, np)
use_external_matrix = (len_trim(matrix_file) > 0)
call get_environment_variable('IDIM', env_buf, length=env_len, status=env_status)
if ((env_status == 0) .and. (env_len > 0)) then
read(env_buf(1:env_len), *, iostat=ios) idim
if ((ios /= 0) .or. (idim < 2)) idim = 10
end if
call get_environment_variable('TIMES', env_buf, length=env_len, status=env_status)
if ((env_status == 0) .and. (env_len > 0)) then
read(env_buf(1:env_len), *, iostat=ios) times
if ((ios /= 0) .or. (times < 1)) times = 100
if (idim_in <= 0) then
call get_environment_variable('IDIM', env_buf, length=env_len, status=env_status)
if ((env_status == 0) .and. (env_len > 0)) then
read(env_buf(1:env_len), *, iostat=ios) idim
if ((ios /= 0) .or. (idim < 2)) idim = 10
end if
end if
n_global = idim * idim * idim
if (times_in <= 0) then
call get_environment_variable('TIMES', env_buf, length=env_len, status=env_status)
if ((env_status == 0) .and. (env_len > 0)) then
read(env_buf(1:env_len), *, iostat=ios) times
if ((ios /= 0) .or. (times < 1)) times = 100
end if
end if
call psb_barrier(ctxt)
call psb_d_gen_pde3d(ctxt,idim,a,y,x,desc_a,afmt,info)
if (use_external_matrix) then
call load_external_matrix(ctxt, matrix_file, matrix_fmt, a, y, x, desc_a, afmt, info)
n_global = int(a%get_nrows(),kind=psb_ipk_)
else
call psb_d_gen_pde3d(ctxt,idim,a,y,x,desc_a,afmt,info)
n_global = idim * idim * idim
end if
if (info /= psb_success_) goto 9999
#ifdef PSB_HAVE_CUDA
if (use_gpu) then
agmold => ahlg
call a%cscnv(info,mold=agmold)
select case(psb_toupper(trim(gpu_fmt)))
case('ELG')
cuda_sparse_mold => cuda_ell_sparse_mold
case('CSRG')
cuda_sparse_mold => cuda_csr_sparse_mold
case('HDIAG','HDIA')
cuda_sparse_mold => cuda_hdia_sparse_mold
case default
cuda_sparse_mold => cuda_hll_sparse_mold
end select
call a%cscnv(info,mold=cuda_sparse_mold)
if (info /= psb_success_) goto 9999
call desc_a%cnv(mold=imold)
call desc_a%cnv(mold=cuda_index_mold)
if (info /= psb_success_) goto 9999
call psb_geasb(x,desc_a,info,mold=vmold)
call x%cnv(mold=cuda_vector_mold)
if (info /= psb_success_) goto 9999
call psb_geasb(y,desc_a,info,mold=vmold)
call y%cnv(mold=cuda_vector_mold)
if (info /= psb_success_) goto 9999
end if
#endif
! warm-up
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.false.)
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap)
if (info /= psb_success_) goto 9999
call psb_barrier(ctxt)
t0 = psb_wtime()
do i = 1, times
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.false.)
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap)
if (info /= psb_success_) exit
end do
t1 = psb_wtime()
@ -618,8 +654,19 @@ contains
avg_t = dt / real(times, psb_dpk_)
if (my_rank == psb_root_) then
write(psb_out_unit,'(/,"SpMV benchmark (no overlap)")')
write(psb_out_unit,'(" idim : ",i0)') idim
if (do_swap) then
write(psb_out_unit,'(/,"SpMV benchmark (overlap)")')
else
write(psb_out_unit,'(/,"SpMV benchmark (no overlap)")')
end if
write(psb_out_unit,'(" cpu matrix fmt : ",a)') trim(afmt)
if (use_gpu) write(psb_out_unit,'(" gpu matrix fmt : ",a)') trim(psb_toupper(trim(gpu_fmt)))
if (use_external_matrix) then
write(psb_out_unit,'(" matrix file : ",a)') trim(matrix_file)
write(psb_out_unit,'(" matrix format : ",a)') trim(matrix_fmt)
else
write(psb_out_unit,'(" idim : ",i0)') idim
end if
write(psb_out_unit,'(" global unknowns : ",i0)') n_global
write(psb_out_unit,'(" repetitions : ",i0)') times
write(psb_out_unit,'(" total time [s] : ",es12.5)') dt
@ -637,6 +684,65 @@ contains
call psb_error_handler(ctxt, err_act)
end subroutine run_spmv_kernel
subroutine load_external_matrix(ctxt, matrix_file, matrix_fmt, a, bv, xv, desc_a, afmt, info)
type(psb_ctxt_type), intent(in) :: ctxt
character(len=*), intent(in) :: matrix_file
character(len=*), intent(in) :: matrix_fmt
type(psb_dspmat_type), intent(out) :: a
type(psb_d_vect_type), intent(out) :: bv, xv
type(psb_desc_type), intent(out) :: desc_a
character(len=*), intent(in) :: afmt
integer(psb_ipk_), intent(out) :: info
type(psb_ldspmat_type) :: aux_a
real(psb_dpk_), allocatable :: rhs_glob(:), x_glob(:)
integer(psb_lpk_) :: nrows, ncols
info = psb_success_
select case(psb_toupper(trim(matrix_fmt)))
case('MM')
call mm_mat_read(aux_a,info,filename=trim(matrix_file))
case('HB')
call hb_read(aux_a,info,filename=trim(matrix_file))
case default
info = psb_err_internal_error_
return
end select
if (info /= psb_success_) return
nrows = aux_a%get_nrows()
ncols = aux_a%get_ncols()
if (nrows /= ncols) then
write(psb_err_unit,'("Input matrix must be square: ",a)') trim(matrix_file)
info = psb_err_internal_error_
return
end if
call psb_matdist(aux_a, a, ctxt, desc_a, info, fmt=afmt, parts=part_block)
if (info /= psb_success_) return
call psb_geall(xv,desc_a,info)
if (info /= psb_success_) return
call psb_geall(bv,desc_a,info)
if (info /= psb_success_) return
allocate(rhs_glob(nrows), x_glob(ncols), stat=info)
if (info /= psb_success_) then
info = psb_err_alloc_dealloc_
return
end if
rhs_glob = done
x_glob = dzero
call psb_scatter(rhs_glob,bv,desc_a,info,root=psb_root_)
if (info /= psb_success_) return
call psb_scatter(x_glob,xv,desc_a,info,root=psb_root_)
if (info /= psb_success_) return
deallocate(rhs_glob, x_glob)
end subroutine load_external_matrix
end module psb_spmv_overlap_test
program psb_spmv_kernel
@ -650,7 +756,22 @@ program psb_spmv_kernel
type(psb_ctxt_type) :: ctxt
logical :: use_gpu
integer(psb_ipk_) :: my_rank, np, k
integer :: ios
character(len=256) :: arg
character(len=256) :: matrix_file
character(len=2) :: matrix_fmt
character(len=8) :: cpu_fmt
character(len=8) :: gpu_fmt
integer(psb_ipk_) :: idim_arg, times_arg
logical :: do_swap
idim_arg = -1
times_arg = -1
matrix_file = ''
matrix_fmt = 'MM'
cpu_fmt = 'CSR'
gpu_fmt = 'HLG'
do_swap = .true.
call psb_init(ctxt)
call psb_info(ctxt, my_rank, np)
@ -670,6 +791,71 @@ program psb_spmv_kernel
case ('FALSE','F','0','NO','N','OFF')
use_gpu = .false.
end select
else if (index(psb_toupper(trim(arg)), '--MATRIX=') == 1) then
matrix_file = adjustl(arg(10:len_trim(arg)))
else if (index(psb_toupper(trim(arg)), '--FMT=') == 1) then
arg = psb_toupper(adjustl(arg(7:len_trim(arg))))
if ((trim(arg) == 'MM') .or. (trim(arg) == 'HB')) matrix_fmt = trim(arg)
else if (index(psb_toupper(trim(arg)), '--MTX_FMT=') == 1) then
arg = psb_toupper(adjustl(arg(10:len_trim(arg))))
if ((trim(arg) == 'MM') .or. (trim(arg) == 'HB')) matrix_fmt = trim(arg)
else if (index(psb_toupper(trim(arg)), '--DIM=') == 1) then
read(arg(7:len_trim(arg)),*,iostat=ios) idim_arg
if ((ios /= 0) .or. (idim_arg < 2)) idim_arg = -1
else if (index(psb_toupper(trim(arg)), '--TIMES=') == 1) then
read(arg(9:len_trim(arg)),*,iostat=ios) times_arg
if ((ios /= 0) .or. (times_arg < 1)) times_arg = -1
else if (index(psb_toupper(trim(arg)), '--ITERS=') == 1) then
read(arg(9:len_trim(arg)),*,iostat=ios) times_arg
if ((ios /= 0) .or. (times_arg < 1)) times_arg = -1
else if (index(psb_toupper(trim(arg)), '--CPU_FORMAT=') == 1) then
cpu_fmt = psb_toupper(adjustl(arg(14:len_trim(arg))))
else if (index(psb_toupper(trim(arg)), '--CPU_FMT=') == 1) then
cpu_fmt = psb_toupper(adjustl(arg(11:len_trim(arg))))
else if (index(psb_toupper(trim(arg)), '--GPU_FORMAT=') == 1) then
gpu_fmt = psb_toupper(adjustl(arg(14:len_trim(arg))))
else if (index(psb_toupper(trim(arg)), '--GPU_FMT=') == 1) then
gpu_fmt = psb_toupper(adjustl(arg(11:len_trim(arg))))
else if ((trim(psb_toupper(arg)) == '--NOOVERLAP') .or. (trim(psb_toupper(arg)) == '--NO_OVERLAP')) then
do_swap = .false.
else if ((trim(psb_toupper(arg)) == '--OVERLAP') .or. (trim(psb_toupper(arg)) == '--SWAP')) then
do_swap = .true.
else if (trim(psb_toupper(arg)) == '--MATRIX') then
if (k < command_argument_count()) call get_command_argument(k+1,matrix_file)
else if (trim(psb_toupper(arg)) == '--FMT') then
if (k < command_argument_count()) then
call get_command_argument(k+1,arg)
arg = psb_toupper(trim(arg))
if ((trim(arg) == 'MM') .or. (trim(arg) == 'HB')) matrix_fmt = trim(arg)
end if
else if (trim(psb_toupper(arg)) == '--MTX_FMT') then
if (k < command_argument_count()) then
call get_command_argument(k+1,arg)
arg = psb_toupper(trim(arg))
if ((trim(arg) == 'MM') .or. (trim(arg) == 'HB')) matrix_fmt = trim(arg)
end if
else if (trim(psb_toupper(arg)) == '--DIM') then
if (k < command_argument_count()) then
call get_command_argument(k+1,arg)
read(arg,*,iostat=ios) idim_arg
if ((ios /= 0) .or. (idim_arg < 2)) idim_arg = -1
end if
else if ((trim(psb_toupper(arg)) == '--TIMES') .or. (trim(psb_toupper(arg)) == '--ITERS')) then
if (k < command_argument_count()) then
call get_command_argument(k+1,arg)
read(arg,*,iostat=ios) times_arg
if ((ios /= 0) .or. (times_arg < 1)) times_arg = -1
end if
else if ((trim(psb_toupper(arg)) == '--CPU_FORMAT') .or. (trim(psb_toupper(arg)) == '--CPU_FMT')) then
if (k < command_argument_count()) then
call get_command_argument(k+1,arg)
cpu_fmt = psb_toupper(trim(arg))
end if
else if ((trim(psb_toupper(arg)) == '--GPU_FORMAT') .or. (trim(psb_toupper(arg)) == '--GPU_FMT')) then
if (k < command_argument_count()) then
call get_command_argument(k+1,arg)
gpu_fmt = psb_toupper(trim(arg))
end if
end if
end do
@ -683,9 +869,12 @@ program psb_spmv_kernel
write(psb_out_unit,*) 'Welcome to PSBLAS version: ', psb_version_string_
write(psb_out_unit,*) 'This is the psb_spmv_kernel sample program'
write(psb_out_unit,'("GPU enabled : ",l1)') use_gpu
write(psb_out_unit,'("Usage: ./psb_spmv_kernel [--gpu=TRUE|FALSE] [--dim=N] [--times=N] ",&
&"[--cpu_fmt=CSR|COO|CSC|ELL|HLL] [--gpu_fmt=HLL|ELL|CSR|HDIA] [--matrix=<path>] [--fmt=MM|HB] ",&
&"[--overlap|--nooverlap]")')
end if
call run_spmv_kernel(ctxt,use_gpu)
call run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_arg,times_arg,do_swap)
#ifdef PSB_HAVE_CUDA
if (use_gpu) call psb_cuda_exit()

@ -17,3 +17,13 @@ Communication patterns exercised:
This test validates the low-level communication behavior in isolation, without
the full SpMV overlap pipeline.
Run options
-----------
- Synthetic 3D stencil descriptor (default):
- `./runs/psb_comm_test [--dim N] [--iters N] [--mode both|baseline|neighbor|persistent]`
- External matrix descriptor:
- `./runs/psb_comm_test [--iters N] [--mode both|baseline|neighbor|persistent] --matrix <path> [--fmt MM|HB]`
With `--matrix`, the descriptor is built from the distributed input matrix pattern.

@ -17,6 +17,7 @@
!
program psb_comm_test
use psb_base_mod
use psb_util_mod
use psb_error_mod, only: psb_set_debug_level, psb_debug_ext_
use psi_mod
use psb_comm_factory_mod, only: psb_comm_set, psb_comm_free
@ -29,9 +30,12 @@ program psb_comm_test
integer(psb_ipk_) :: idim
integer(psb_ipk_) :: argc
integer(psb_ipk_) :: iters
character(len=32) :: arg
character(len=256) :: arg
character(len=16) :: mode
character(len=256) :: matrix_file
character(len=2) :: matrix_fmt
logical :: debug_swapdata
logical :: use_external_matrix
! ---- descriptor / context ----
type(psb_ctxt_type) :: ctxt
@ -39,6 +43,8 @@ program psb_comm_test
integer(psb_ipk_) :: my_rank, np, info, i, nr, number_of_local_rows
integer(psb_lpk_) :: m, nt
integer(psb_lpk_), allocatable :: myidx(:)
type(psb_dspmat_type) :: a_mat
type(psb_ldspmat_type) :: aux_a
! ---- vectors ----
type(psb_d_vect_type) :: v_baseline, v_neighbor, v_neighbor_persistent
@ -55,6 +61,7 @@ program psb_comm_test
! ---- error / reporting ----
integer(psb_ipk_) :: n_pass, n_total, imode
logical :: run_baseline, run_neighbor, run_persistent
logical :: mat_allocated
logical :: comm_ok
real(psb_dpk_) :: err, tol
real(psb_dpk_) :: t0, t1, dt, tsum_baseline, tsum_neighbor, tsum_neighbor_persistent
@ -70,6 +77,10 @@ program psb_comm_test
iters = 5
mode = 'both'
debug_swapdata = .false.
matrix_file = ''
matrix_fmt = 'MM'
use_external_matrix = .false.
mat_allocated = .false.
! ---- parse command-line argument for idim ----
idim = 10
@ -86,8 +97,24 @@ program psb_comm_test
call get_command_argument(i+1, arg)
read(arg, *) iters
end if
else if (index(psb_toupper(trim(arg)),'--MATRIX=') == 1) then
matrix_file = adjustl(arg(10:len_trim(arg)))
else if (trim(psb_toupper(arg)) == '--MATRIX') then
if (i < argc) then
call get_command_argument(i+1, matrix_file)
end if
else if (index(psb_toupper(trim(arg)),'--FMT=') == 1) then
arg = psb_toupper(adjustl(arg(7:len_trim(arg))))
if ((trim(arg) == 'MM') .or. (trim(arg) == 'HB')) matrix_fmt = trim(arg)
else if (trim(psb_toupper(arg)) == '--FMT') then
if (i < argc) then
call get_command_argument(i+1, arg)
arg = psb_toupper(trim(arg))
if ((trim(arg) == 'MM') .or. (trim(arg) == 'HB')) matrix_fmt = trim(arg)
end if
end if
end do
use_external_matrix = (len_trim(matrix_file) > 0)
! parse optional mode flag
do i = 1, argc
@ -126,7 +153,7 @@ program psb_comm_test
run_persistent = .true.
end select
if (idim <= 0) then
if ((.not.use_external_matrix) .and. (idim <= 0)) then
write(*,*) 'Invalid dimension specified. Usage: --dim <positive integer>'
call psb_abort(ctxt)
end if
@ -140,48 +167,83 @@ program psb_comm_test
write(psb_out_unit,'("================================================")')
write(psb_out_unit,'(" Test: D-type halo baseline vs neighbor topo")')
write(psb_out_unit,'(" Processes : ",i0)') np
write(psb_out_unit,'(" Grid : ",i0," x ",i0," x ",i0)') idim,idim,idim
if (use_external_matrix) then
write(psb_out_unit,'(" Matrix : ",a)') trim(matrix_file)
write(psb_out_unit,'(" Format : ",a)') trim(matrix_fmt)
else
write(psb_out_unit,'(" Grid : ",i0," x ",i0," x ",i0)') idim,idim,idim
end if
write(psb_out_unit,'(" Usage : ./psb_comm_test [--dim N] [--iters N] [--mode ...] ",&
&"[--matrix <path>] [--fmt MM|HB]")')
write(psb_out_unit,'("================================================")')
end if
! ==================================================================
! 2. Build descriptor with 7-point stencil connectivity
! ==================================================================
m = (1_psb_lpk_ * idim) * idim * idim
nt = (m + np - 1) / np
nr = max(0, min(int(nt,psb_ipk_), int(m - (my_rank * nt),psb_ipk_)))
call psb_cdall(ctxt, desc_a, info, nl=nr)
if (info /= psb_success_) then
write(psb_err_unit,*) my_rank, 'cdall error:', info
call psb_abort(ctxt)
end if
if (use_external_matrix) then
select case(psb_toupper(trim(matrix_fmt)))
case('MM')
call mm_mat_read(aux_a,info,filename=trim(matrix_file))
case('HB')
call hb_read(aux_a,info,filename=trim(matrix_file))
case default
info = psb_err_internal_error_
end select
if (info /= psb_success_) then
write(psb_err_unit,*) my_rank, 'matrix read error:', info
call psb_abort(ctxt)
end if
if (aux_a%get_nrows() /= aux_a%get_ncols()) then
write(psb_err_unit,*) my_rank, 'matrix must be square for this test'
call psb_abort(ctxt)
end if
m = aux_a%get_nrows()
call psb_matdist(aux_a, a_mat, ctxt, desc_a, info, fmt='CSR', parts=part_block)
if (info /= psb_success_) then
write(psb_err_unit,*) my_rank, 'matdist error:', info
call psb_abort(ctxt)
end if
mat_allocated = .true.
myidx = desc_a%get_global_indices()
number_of_local_rows = size(myidx)
else
m = (1_psb_lpk_ * idim) * idim * idim
nt = (m + np - 1) / np
nr = max(0, min(int(nt,psb_ipk_), int(m - (my_rank * nt),psb_ipk_)))
myidx = desc_a%get_global_indices()
number_of_local_rows = size(myidx)
call psb_cdall(ctxt, desc_a, info, nl=nr)
if (info /= psb_success_) then
write(psb_err_unit,*) my_rank, 'cdall error:', info
call psb_abort(ctxt)
end if
do i = 1, number_of_local_rows
call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)/), desc_a, info)
if (myidx(i) > 1) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)-1/), desc_a, info)
if (myidx(i) < m) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)+1/), desc_a, info)
if (myidx(i) > idim) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)-idim/), desc_a, info)
if (myidx(i) + idim <= m) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)+idim/), desc_a, info)
if (myidx(i) > int(idim,psb_lpk_)*idim) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), &
& (/myidx(i) - int(idim,psb_lpk_)*idim/), desc_a, info)
if (myidx(i) + int(idim,psb_lpk_)*idim <= m) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), &
& (/myidx(i) + int(idim,psb_lpk_)*idim/), desc_a, info)
end do
myidx = desc_a%get_global_indices()
number_of_local_rows = size(myidx)
do i = 1, number_of_local_rows
call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)/), desc_a, info)
if (myidx(i) > 1) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)-1/), desc_a, info)
if (myidx(i) < m) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)+1/), desc_a, info)
if (myidx(i) > idim) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)-idim/), desc_a, info)
if (myidx(i) + idim <= m) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), (/myidx(i)+idim/), desc_a, info)
if (myidx(i) > int(idim,psb_lpk_)*idim) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), &
& (/myidx(i) - int(idim,psb_lpk_)*idim/), desc_a, info)
if (myidx(i) + int(idim,psb_lpk_)*idim <= m) &
& call psb_cdins(1_psb_ipk_, (/myidx(i)/), &
& (/myidx(i) + int(idim,psb_lpk_)*idim/), desc_a, info)
end do
call psb_cdasb(desc_a, info)
if (info /= psb_success_) then
write(psb_err_unit,*) my_rank, 'cdasb error:', info
call psb_abort(ctxt)
call psb_cdasb(desc_a, info)
if (info /= psb_success_) then
write(psb_err_unit,*) my_rank, 'cdasb error:', info
call psb_abort(ctxt)
end if
end if
nrow = desc_a%get_local_rows() ! owned
@ -574,6 +636,7 @@ program psb_comm_test
9999 call psb_gefree(v_baseline, desc_a, info)
call psb_gefree(v_neighbor, desc_a, info)
call psb_gefree(v_neighbor_persistent, desc_a, info)
if (mat_allocated) call psb_spfree(a_mat, desc_a, info)
call psb_cdfree(desc_a, info)
call psb_exit(ctxt)

@ -837,14 +837,6 @@ program psb_d_pde3d
& err=err,itrace=itrace,&
& istop=istopc)
case('BICGSTAB','BICGSTABL','BICG','CG','CGS','FCG','GCR','RGMRES')
call psb_comm_set(psb_comm_persistent_ineighbor_alltoallv_,xxv%v%comm_handle,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='comm init'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call psb_krylov(kmethd,a,prec,bv,xxv,eps,&
& desc_a,info,itmax=itmax,iter=iter,err=err,itrace=itrace,&
& istop=istopc,irst=irst)

Loading…
Cancel
Save