[WIP] RMA swapdata optimization, nest builder updates, GPU/scorep test tooling

- psi_dswapdata / psb_comm_rma: RMA swapdata optimization (PSCW + Win_allocate)
- psb_d_nest_builder: builder updates
- cuda: Makefile + psb_d_cuda_vect_mod adjustments
- test/comm: spmv/cg drivers, Makefiles, GPU + scorep/strong-scaling scripts
- test/nested: halo-regime test + Makefile/CMake updates
communication_v2
Stack-1 1 day ago
parent 4106f44331
commit 9533c22989

@ -1009,6 +1009,8 @@ contains
integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size
integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
integer(psb_mpk_), allocatable :: grp_ranks(:)
logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
type(psb_comm_rma_handle), pointer :: rma_handle
character(len=30) :: name
@ -1077,59 +1079,42 @@ contains
end if
end if
! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
! combuf is the pinned/registered buffer, so RMA uses pinned memory.
if (buffer_size > 0) then
if (.not. allocated(y%combuf)) then
! Allocate buffer with size of comm_indexes%v to include all metadata,
! matching baseline and topology buffer layout
call y%new_buffer(ione*size(comm_indexes%v), info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
else if (size(y%combuf) < size(comm_indexes%v)) then
! Need a larger exposed memory area: recreate the RMA window first,
! then reallocate combuf and lazily create a new window below.
if (rma_handle%window_open) then
call mpi_win_unlock_all(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_open = .false.
end if
if (rma_handle%window_ready) then
call mpi_win_free(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_ready = .false.
rma_handle%win = mpi_win_null
rma_handle%window_ready = .false.
end if
! Allocate buffer with size of comm_indexes%v to include all metadata,
! matching baseline and topology buffer layout
call y%new_buffer(ione*size(comm_indexes%v), info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
end if
end if
if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
! Expose combuf once and keep the window around until the descriptor changes.
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
if (.not. rma_handle%window_ready) then
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_ready = .true.
end if
rma_handle%window_ready = .true.
end if
if (buffer_size > 0) then
@ -1138,7 +1123,19 @@ contains
end if
call y%device_wait()
! Pull data from each peer with per-neighbor passive lock (neighbor-only sync).
! Open exposure (post) + access (start) epochs over the CACHED neighbor
! group (built once with the layout in the handle) -- no per-call build.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end if
do neighbor_idx=1, num_neighbors
proc_to_comm = rma_handle%peer_proc(neighbor_idx)
recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -1161,12 +1158,6 @@ contains
end if
if (recv_count > 0) then
call mpi_win_lock(MPI_LOCK_SHARED, prc_rank, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
remote_disp = int(remote_base - 1, kind=MPI_ADDRESS_KIND)
call mpi_get(y%combuf(recv_pos), recv_count, psb_mpi_r_dpk_, prc_rank, remote_disp, recv_count, psb_mpi_r_dpk_, &
& rma_handle%win, iret)
@ -1175,12 +1166,6 @@ contains
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
call mpi_win_unlock(prc_rank, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
end if
else
if (send_count /= recv_count) then
@ -1191,6 +1176,19 @@ contains
y%combuf(recv_pos:recv_pos+recv_count-1) = y%combuf(send_pos:send_pos+send_count-1)
end if
end do
! Close access epoch (my GETs done) then exposure epoch (peers done
! reading my window): received data is now in y%combuf. Cached group, no free.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_complete(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_wait(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end if
end if
end if
@ -1235,7 +1233,8 @@ contains
integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size
integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
integer(psb_mpk_), parameter :: rma_push_notify_tag = 914_psb_mpk_
integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
integer(psb_mpk_), allocatable :: grp_ranks(:)
logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
type(psb_comm_rma_handle), pointer :: rma_handle
character(len=30) :: name
@ -1302,59 +1301,42 @@ contains
end if
end if
! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
! combuf is the pinned/registered buffer, so RMA uses pinned memory.
if (buffer_size > 0) then
if (.not. allocated(y%combuf)) then
! Allocate buffer with size of comm_indexes%v to include all metadata,
! matching baseline and topology buffer layout
call y%new_buffer(ione*size(comm_indexes%v), info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
else if (size(y%combuf) < size(comm_indexes%v)) then
! Need a larger exposed memory area: recreate the RMA window first,
! then reallocate combuf and lazily create a new window below.
if (rma_handle%window_open) then
call mpi_win_unlock_all(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_open = .false.
end if
if (rma_handle%window_ready) then
call mpi_win_free(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_ready = .false.
rma_handle%win = mpi_win_null
rma_handle%window_ready = .false.
end if
! Allocate buffer with size of comm_indexes%v to include all metadata,
! matching baseline and topology buffer layout
call y%new_buffer(ione*size(comm_indexes%v), info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
end if
end if
if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
! Keep the window alive across repetitions: it is created once and reused.
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
if (.not. rma_handle%window_ready) then
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_ready = .true.
end if
rma_handle%window_ready = .true.
end if
if (buffer_size > 0) then
@ -1363,34 +1345,20 @@ contains
end if
call y%device_wait()
! Pre-post notification receives before opening the window (prevents isend/irecv ordering issues).
if (num_neighbors > 0) then
rma_handle%notify_recv_reqs(1:num_neighbors) = MPI_REQUEST_NULL
rma_handle%notify_send_reqs(1:num_neighbors) = MPI_REQUEST_NULL
end if
do neighbor_idx=1, num_neighbors
proc_to_comm = rma_handle%peer_proc(neighbor_idx)
if (proc_to_comm /= my_rank) then
prc_rank = rma_handle%peer_mpi_rank(neighbor_idx)
call mpi_irecv(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
& rma_push_notify_tag, icomm, rma_handle%notify_recv_reqs(neighbor_idx), iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
! Open exposure (post) + access (start) epochs over the CACHED neighbor
! group (built once with the layout in the handle) -- no per-call build.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end do
call mpi_win_lock_all(0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_open = .true.
! Push data to each peer; after flush send a P2P notification so target knows data arrived.
! Issue every PUT (remote) or perform the self-copy (local).
do neighbor_idx=1, num_neighbors
proc_to_comm = rma_handle%peer_proc(neighbor_idx)
recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -1422,19 +1390,6 @@ contains
goto 9999
end if
end if
call mpi_win_flush(prc_rank, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
call mpi_isend(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
& rma_push_notify_tag, icomm, rma_handle%notify_send_reqs(neighbor_idx), iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
else
if (send_count /= recv_count) then
info = psb_err_internal_error_
@ -1444,36 +1399,24 @@ contains
y%combuf(recv_pos:recv_pos+recv_count-1) = y%combuf(send_pos:send_pos+send_count-1)
end if
end do
end if
end if
! WAIT phase: close epoch, wait for P2P notifications, then scatter.
if (do_wait) then
if (rma_handle%window_open) then
call mpi_win_unlock_all(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_open = .false.
end if
if (num_neighbors > 0) then
call mpi_waitall(num_neighbors, rma_handle%notify_recv_reqs, MPI_STATUSES_IGNORE, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
call mpi_waitall(num_neighbors, rma_handle%notify_send_reqs, MPI_STATUSES_IGNORE, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
! Close access epoch (my PUTs done) then exposure epoch (peers' PUTs into
! me done): receive area now holds the pushed data. Cached group, no free.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_complete(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_wait(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end if
end if
end if
! WAIT phase: data already delivered by PSCW in START; just scatter into Y.
if (do_wait) then
if (total_recv > 0) then
call y%sct(int(total_recv,psb_mpk_), rma_handle%peer_recv_indexes, y%combuf(total_send+1:total_send+total_recv), beta)
end if
@ -2341,6 +2284,8 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size, total_send_, total_recv_
integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
integer(psb_mpk_), allocatable :: grp_ranks(:)
logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
type(psb_comm_rma_handle), pointer :: rma_handle
character(len=30) :: name
@ -2406,33 +2351,42 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
end if
end if
! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
! combuf is the pinned/registered buffer, so RMA uses pinned memory.
if (buffer_size > 0) then
if (.not. allocated(y%combuf)) then
call y%new_buffer(buffer_size, info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
else if (size(y%combuf) < buffer_size) then
if (rma_handle%window_open) then
call mpi_win_unlock_all(rma_handle%win, iret)
rma_handle%window_open = .false.
end if
if (rma_handle%window_ready) then
call mpi_win_free(rma_handle%win, iret)
rma_handle%win = mpi_win_null
rma_handle%window_ready = .false.
end if
call y%new_buffer(buffer_size, info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
end if
end if
if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
if (.not. rma_handle%window_ready) then
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_ready = .true.
end if
rma_handle%window_ready = .true.
end if
if (buffer_size > 0) then
@ -2441,7 +2395,19 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
end if
call y%device_wait()
! Pull data from each peer with per-neighbor passive lock (neighbor-only sync).
! Open exposure (post) + access (start) epochs over the CACHED neighbor
! group (built once with the layout in the handle) -- no per-call build.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end if
do neighbor_idx=1, num_neighbors
proc_to_comm = rma_handle%peer_proc(neighbor_idx)
recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -2458,12 +2424,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
goto 9999
end if
if (recv_count > 0) then
call mpi_win_lock(MPI_LOCK_SHARED, prc_rank, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
remote_disp = int((remote_base - 1) * n, kind=MPI_ADDRESS_KIND)
call mpi_get(y%combuf(recv_pos), recv_count*n, psb_mpi_r_dpk_, prc_rank, remote_disp, recv_count*n, psb_mpi_r_dpk_, &
& rma_handle%win, iret)
@ -2472,12 +2432,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
call mpi_win_unlock(prc_rank, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
end if
else
if (send_count /= recv_count) then
@ -2488,6 +2442,19 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
y%combuf(recv_pos:recv_pos+recv_count*n-1) = y%combuf(send_pos:send_pos+send_count*n-1)
end if
end do
! Close access epoch (my GETs done) then exposure epoch (peers done
! reading my window): received data is now in y%combuf. Cached group, no free.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_complete(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_wait(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end if
end if
end if
@ -2531,7 +2498,8 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size, total_send_, total_recv_
integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
integer(psb_mpk_), parameter :: rma_push_notify_tag = 914_psb_mpk_
integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
integer(psb_mpk_), allocatable :: grp_ranks(:)
logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
type(psb_comm_rma_handle), pointer :: rma_handle
character(len=30) :: name
@ -2597,33 +2565,42 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
end if
end if
! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
! combuf is the pinned/registered buffer, so RMA uses pinned memory.
if (buffer_size > 0) then
if (.not. allocated(y%combuf)) then
call y%new_buffer(buffer_size, info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
else if (size(y%combuf) < buffer_size) then
if (rma_handle%window_open) then
call mpi_win_unlock_all(rma_handle%win, iret)
rma_handle%window_open = .false.
end if
if (rma_handle%window_ready) then
call mpi_win_free(rma_handle%win, iret)
rma_handle%win = mpi_win_null
rma_handle%window_ready = .false.
end if
call y%new_buffer(buffer_size, info)
if (info /= psb_success_) then
call psb_errpush(psb_err_alloc_dealloc_,name)
goto 9999
call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
end if
end if
end if
if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
if (.not. rma_handle%window_ready) then
element_bytes = storage_size(y%combuf(1))/8
exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
& mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_ready = .true.
end if
rma_handle%window_ready = .true.
end if
if (buffer_size > 0) then
@ -2632,33 +2609,20 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
end if
call y%device_wait()
! Pre-post notification receives before opening the window.
if (num_neighbors > 0) then
rma_handle%notify_recv_reqs(1:num_neighbors) = MPI_REQUEST_NULL
rma_handle%notify_send_reqs(1:num_neighbors) = MPI_REQUEST_NULL
end if
do neighbor_idx=1, num_neighbors
proc_to_comm = rma_handle%peer_proc(neighbor_idx)
if (proc_to_comm /= my_rank) then
prc_rank = rma_handle%peer_mpi_rank(neighbor_idx)
call mpi_irecv(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
& rma_push_notify_tag, icomm, rma_handle%notify_recv_reqs(neighbor_idx), iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
! Open exposure (post) + access (start) epochs over the CACHED neighbor
! group (built once with the layout in the handle) -- no per-call build.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end do
call mpi_win_lock_all(0, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_open = .true.
! Issue every PUT (remote) or perform the self-copy (local).
do neighbor_idx=1, num_neighbors
proc_to_comm = rma_handle%peer_proc(neighbor_idx)
recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -2684,19 +2648,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
goto 9999
end if
end if
call mpi_win_flush(prc_rank, rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
call mpi_isend(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
& rma_push_notify_tag, icomm, rma_handle%notify_send_reqs(neighbor_idx), iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
else
if (send_count /= recv_count) then
info = psb_err_internal_error_
@ -2706,34 +2657,24 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
y%combuf(recv_pos:recv_pos+recv_count*n-1) = y%combuf(send_pos:send_pos+send_count*n-1)
end if
end do
! Close access epoch (my PUTs done) then exposure epoch (peers' PUTs into
! me done): receive area now holds the pushed data. Cached group, no free.
if (rma_handle%nbr_grp /= mpi_group_null) then
call mpi_win_complete(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
call mpi_win_wait(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
end if
end if
end if
end if
! WAIT phase: close epoch, wait for P2P notifications, then scatter.
! WAIT phase: data already delivered by PSCW in START; just scatter into Y.
if (do_wait) then
if (rma_handle%window_open) then
call mpi_win_unlock_all(rma_handle%win, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
rma_handle%window_open = .false.
end if
if (num_neighbors > 0) then
call mpi_waitall(num_neighbors, rma_handle%notify_recv_reqs, MPI_STATUSES_IGNORE, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
call mpi_waitall(num_neighbors, rma_handle%notify_send_reqs, MPI_STATUSES_IGNORE, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
call psb_errpush(info,name,m_err=(/iret/))
goto 9999
end if
end if
if (total_recv > 0) then
call y%sct(int(total_recv,psb_mpk_), rma_handle%peer_recv_indexes, y%combuf(total_send_+1:total_send_+total_recv_), beta)
end if

@ -3,6 +3,7 @@ module psb_comm_rma_mod
use psb_desc_const_mod, only: psb_proc_id_, psb_n_elem_recv_, psb_elem_recv_, &
& psb_n_elem_send_, psb_elem_send_
use psb_error_mod
use, intrinsic :: iso_c_binding, only: c_ptr, c_null_ptr
#ifdef PSB_MPI_MOD
use mpi
#endif
@ -18,6 +19,9 @@ module psb_comm_rma_mod
integer(psb_mpk_) :: win = mpi_win_null
logical :: window_ready = .false.
logical :: window_open = .false.
type(c_ptr) :: win_base = c_null_ptr
integer(psb_ipk_) :: win_nelem = 0
integer(psb_mpk_) :: nbr_grp = mpi_group_null ! neighbor MPI group, built once with the layout
logical :: layout_ready = .false.
integer(psb_ipk_) :: layout_nnbr = -1
integer(psb_ipk_) :: layout_send = -1
@ -57,6 +61,8 @@ contains
this%win = mpi_win_null
this%window_ready = .false.
this%window_open = .false.
this%win_base = c_null_ptr
this%win_nelem = 0
this%layout_ready = .false.
this%layout_nnbr = -1
this%layout_send = -1
@ -67,8 +73,14 @@ contains
subroutine psb_comm_rma_clear_memory_buffer_layout(this, info)
class(psb_comm_rma_handle), intent(inout) :: this
integer(psb_ipk_), intent(out) :: info
integer(psb_mpk_) :: iret
info = psb_success_
! Neighbors change when the layout is rebuilt: free the cached group.
if (this%nbr_grp /= mpi_group_null) then
call mpi_group_free(this%nbr_grp, iret)
this%nbr_grp = mpi_group_null
end if
if (allocated(this%peer_proc)) deallocate(this%peer_proc)
if (allocated(this%peer_send_counts)) deallocate(this%peer_send_counts)
if (allocated(this%peer_recv_counts)) deallocate(this%peer_recv_counts)
@ -105,6 +117,8 @@ contains
integer(psb_ipk_) :: list_pos, send_offset, recv_offset
integer(psb_ipk_) :: proc_to_comm, recv_count, send_count
integer(psb_ipk_) :: local_meta(4), remote_meta(4)
integer(psb_mpk_) :: comm_grp, n_off, grp_idx
integer(psb_mpk_), allocatable :: grp_ranks(:)
call this%clear_memory_buffer_layout(info)
if (info /= psb_success_) return
@ -206,6 +220,39 @@ contains
return
end if
! Build the neighbor MPI group ONCE (off-diagonal peers); reused by every
! RMA exchange (PSCW post/start) instead of rebuilding it each call.
n_off = 0
do neighbor_idx = 1, n_neighbors
if (this%peer_proc(neighbor_idx) /= my_rank) n_off = n_off + 1
end do
if (n_off > 0) then
allocate(grp_ranks(n_off), stat=iret)
if (iret /= 0) then
info = psb_err_alloc_dealloc_
return
end if
grp_idx = 0
do neighbor_idx = 1, n_neighbors
if (this%peer_proc(neighbor_idx) /= my_rank) then
grp_idx = grp_idx + 1
grp_ranks(grp_idx) = this%peer_mpi_rank(neighbor_idx)
end if
end do
call mpi_comm_group(icomm, comm_grp, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
return
end if
call mpi_group_incl(comm_grp, n_off, grp_ranks, this%nbr_grp, iret)
if (iret /= mpi_success) then
info = psb_err_mpi_error_
return
end if
call mpi_group_free(comm_grp, iret)
deallocate(grp_ranks, stat=iret)
end if
this%layout_nnbr = n_neighbors
this%layout_send = send_total
this%layout_recv = recv_total
@ -229,6 +276,8 @@ contains
call mpi_win_free(this%win, iret)
this%win = mpi_win_null
end if
this%win_base = c_null_ptr
this%win_nelem = 0
this%window_ready = .false.
this%layout_ready = .false.
this%layout_nnbr = -1

@ -83,6 +83,15 @@ module psb_d_nest_builder_mod
real(psb_dpk_), allocatable :: entry_vals(:)
end type psb_d_nest_block_buffer
! per-block selective halo support: scatter map from a block's own (restricted)
! column descriptor onto the field-j column space the block matrix is localized
! against. map(k) is the field-j local column position of the k-th local column
! of block_col_desc(i,j); used to scatter a per-block psb_halo result into the
! field vector consumed by the (field-localized) block csmv.
type :: psb_d_nest_blk2field_map
integer(psb_ipk_), allocatable :: map(:)
end type psb_d_nest_blk2field_map
type :: psb_d_nest_matrix
type(psb_ctxt_type) :: context
integer(psb_ipk_) :: n_fields = 0
@ -95,6 +104,14 @@ module psb_d_nest_builder_mod
type(psb_desc_nest_type) :: grid_desc
type(psb_dspmat_type) :: a_glob ! the matrix to hand to Krylov
type(psb_desc_type) :: desc_glob ! the global descriptor
! per-block (selective) halo-exchange support. block_col_desc(i,j) carries
! ONLY block (i,j)'s own off-process columns (a subset of field j's union
! halo), so its halo can be exchanged independently of the rest of column j;
! blk2field(i,j) maps its local columns onto the field-j local columns the
! block matrix is localized against. These coexist with a_glob/desc_glob
! (the fused union path) WITHOUT a second copy of the blocks.
type(psb_desc_type), allocatable :: block_col_desc(:,:)
type(psb_d_nest_blk2field_map), allocatable :: blk2field(:,:)
contains
procedure, pass(op) :: init => psb_d_nest_op_init
procedure, pass(op) :: ins => psb_d_nest_op_ins
@ -107,7 +124,7 @@ module psb_d_nest_builder_mod
end type psb_d_nest_matrix
private
public :: psb_d_nest_matrix
public :: psb_d_nest_matrix, psb_d_nest_blk2field_map
contains
@ -201,6 +218,8 @@ contains
type(psb_d_nest_base_mat) :: nest_operator
integer(psb_ipk_) :: n_fields, i_field, j_field
integer(psb_ipk_) :: field_local_rows, n_block_cols, k_col, field_col
integer(psb_lpk_) :: global_col_idx
character(len=24) :: name
info = psb_success_
@ -215,6 +234,50 @@ contains
end if
end do
! 1b) per-block (restricted-halo) column descriptors + scatter maps, for the
! selective halo-exchange regime. Each present block (i,j) gets a
! descriptor whose halo is ONLY block (i,j)'s own off-process columns
! (registered from that single block's buffer, NOT the column-wide union),
! sharing field j's owned distribution. blk2field(i,j)%map sends its
! local columns onto field j's local columns (the space the block matrix
! is localized against), so a per-block psb_halo can be scattered into the
! field vector consumed by the block csmv. The buffers are still alive
! here (freed in step 6).
allocate(op%block_col_desc(n_fields,n_fields), op%blk2field(n_fields,n_fields), stat=info)
if (info /= 0) then
info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return
end if
do j_field = 1, n_fields
field_local_rows = op%field_desc(j_field)%get_local_rows()
do i_field = 1, n_fields
if (op%block_buffer(i_field,j_field)%n_entries <= 0) cycle
! owned distribution identical to field j (same nl => same global ranges)
call psb_cdall(op%context, op%block_col_desc(i_field,j_field), info, nl=field_local_rows)
if (info == psb_success_) &
& call psb_cdins(op%block_buffer(i_field,j_field)%n_entries, &
& op%block_buffer(i_field,j_field)%entry_cols(1:op%block_buffer(i_field,j_field)%n_entries), &
& op%block_col_desc(i_field,j_field), info)
if (info == psb_success_) call psb_cdasb(op%block_col_desc(i_field,j_field), info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_, name, a_err='per-block cdasb'); return
end if
! block-local column k -> field-j local column position
n_block_cols = op%block_col_desc(i_field,j_field)%get_local_cols()
allocate(op%blk2field(i_field,j_field)%map(n_block_cols), stat=info)
if (info /= 0) then
info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return
end if
do k_col = 1, n_block_cols
call op%block_col_desc(i_field,j_field)%l2g(k_col, global_col_idx, info)
if (info == psb_success_) call op%field_desc(j_field)%g2l(global_col_idx, field_col, info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_, name, a_err='blk2field l2g/g2l'); return
end if
op%blk2field(i_field,j_field)%map(k_col) = field_col
end do
end do
end do
! 2) build the local blocks (generally rectangular) from the triplets
op%block_storage%nrblocks = n_fields
op%block_storage%ncblocks = n_fields
@ -300,6 +363,17 @@ contains
call op%desc_glob%free(local_info)
call op%grid_desc%free(local_info)
end if
if (allocated(op%block_col_desc)) then
do j_field = 1, size(op%block_col_desc,2)
do i_field = 1, size(op%block_col_desc,1)
! only present blocks were assembled (and carry a blk2field map)
if (allocated(op%blk2field) .and. allocated(op%blk2field(i_field,j_field)%map)) &
& call op%block_col_desc(i_field,j_field)%free(local_info)
end do
end do
deallocate(op%block_col_desc, stat=local_info)
end if
if (allocated(op%blk2field)) deallocate(op%blk2field, stat=local_info)
if (allocated(op%field_desc)) then
do i_field = 1, size(op%field_desc)
call op%field_desc(i_field)%free(local_info)

@ -106,7 +106,7 @@ svectordev.o: svectordev.h vectordev.h
dvectordev.o: dvectordev.h vectordev.h
cvectordev.o: cvectordev.h vectordev.h
zvectordev.o: zvectordev.h vectordev.h
psb_cuda_env_mod.o: base_cusparse_mod.o
psb_cuda_env_mod.o: base_cusparse_mod.o core_mod.o
psb_cuda_mod.o: psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\
psb_d_cuda_vect_mod.o psb_s_cuda_vect_mod.o\
psb_z_cuda_vect_mod.o psb_c_cuda_vect_mod.o\

@ -77,6 +77,7 @@ module psb_d_cuda_vect_mod
procedure, pass(x) :: set_sync => d_cuda_set_sync
procedure, pass(x) :: set_scal => d_cuda_set_scal
!!$ procedure, pass(x) :: set_vect => d_cuda_set_vect
procedure, pass(x) :: gthzv => d_cuda_gthzv
procedure, pass(x) :: gthzv_x => d_cuda_gthzv_x
procedure, pass(y) :: sctb => d_cuda_sctb
procedure, pass(y) :: sctb_x => d_cuda_sctb_x
@ -231,6 +232,45 @@ contains
end select
end subroutine d_cuda_check_addr
subroutine d_cuda_gthzv(n,idx,x,y)
! GPU override of the plain-array-index gather y(1:n) = x(idx(1:n)).
! Without it the generic gth(n,idx(:),buf) -- used ONLY by the RMA swap path
! (psi_d_swapdata, rma_pull/push) -- fell back to the host d_base_gthzv, which
! calls x%sync() and copies the WHOLE vector device->host on EVERY swap. Here
! we gather on the device straight from x%deviceVect (mirrors the class-default
! branch of d_cuda_gthzv_x): only the n boundary indices move H2D, no full D2H.
use psb_cuda_env_mod
use psi_serial_mod
implicit none
integer(psb_mpk_) :: n
integer(psb_ipk_) :: idx(:)
real(psb_dpk_) :: y(:)
class(psb_d_vect_cuda) :: x
integer :: info, ni
info = 0
if (x%is_host()) call x%sync() ! ensure device copy is current (no D2H of the whole vector)
ni = size(idx)
if (x%i_buf_sz < ni) then
if (c_associated(x%i_buf)) then
call freeInt(x%i_buf); x%i_buf = c_null_ptr
end if
info = allocateInt(x%i_buf,ni); x%i_buf_sz = ni
end if
if (x%dt_buf_sz < n) then
if (c_associated(x%dt_buf)) then
call freeDouble(x%dt_buf); x%dt_buf = c_null_ptr
end if
info = allocateDouble(x%dt_buf,n); x%dt_buf_sz = n
end if
if (info == 0) info = writeInt(x%i_buf,idx,ni)
if (info == 0) info = igathMultiVecDeviceDouble(x%deviceVect, 0, n, 1, x%i_buf, 1, x%dt_buf, 1)
if (info == 0) info = readDouble(x%dt_buf,y,n)
end subroutine d_cuda_gthzv
subroutine d_cuda_gthzv_x(i,n,idx,x,y)
use psb_cuda_env_mod
use psi_serial_mod

@ -0,0 +1,89 @@
#!/bin/bash
# ============================================================================
# gpu_cg.sh GPU Conjugate Gradient across comm backends -- "sparse ranks"
#
# Final test of the comm-scheme study: a full CG solve (not just spmv) on GPU,
# with few MPI ranks (1 rank = 1 GPU) spread over MANY nodes so the halo
# exchange is dominated by inter-node traffic -- the regime where RMA may pay
# off. The test sweeps ALL backends (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET,
# MPI_PUT) x 3 preconditioners internally, so there is no per-mode loop here.
#
# It also self-verifies, for every scheme, that an internally-allocated work
# vector (like CG's hidden r/p/q/z) inherits the scheme from desc_a%comm_type
# -- look for "[OK] internal-style work vector inherited scheme" in run.out,
# and abort on "INTERNAL-VECTOR SCHEME MISMATCH".
# ============================================================================
#SBATCH --job-name=cg_gpu
#SBATCH --output=cg_gpu_%j.out
#SBATCH --error=cg_gpu_%j.err
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=4 # MN5-ACC: 4x H100 per node
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=20 # 80 cores / 4 ranks
#SBATCH --time=00:30:00
#SBATCH --qos=acc_debug
#SBATCH --account=ehpc580
# ============================================================================
# USER CONFIGURATION
# ============================================================================
NREP=8 # CG solves per (scheme,prec) for statistics
NWARM=1 # warm-up solves (discarded)
ITMAX=500 # CG iteration cap
IDIM=200 # ignored when --matrix is given
# --- "sparse" knob: GPUs (= ranks) per node.
# RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm.
RANKS_PER_NODE=1
RANK_POINTS="2 4 8" # total ranks per scale point
EXE=$HOME/Desktop/scorep/psblas3/test/comm/cg/runs/psb_comm_cg_test
MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx
# ============================================================================
# ENVIRONMENT
# ============================================================================
module purge
module load bsc/1.0
module load nvidia-hpc-sdk/25.3
module load gcc/12.3.0
module load ucx/1.16.0-gcc
module load openmpi/5.0.5-gcc
module load openblas/0.3.27-gcc
export PATH="$HOME/scorep-cuda/bin:$PATH"
export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
export OMPI_MCA_coll_hcoll_enable=0
export SCOREP_CUDA_ENABLE=yes
export SCOREP_CUDA_BUFFER=48M
RESDIR=$SLURM_SUBMIT_DIR/results_cg_gpu_${SLURM_JOB_ID}
mkdir -p $RESDIR
echo "=== CG GPU (sparse-ranks study) ==="
echo " nrep=$NREP nwarm=$NWARM itmax=$ITMAX ranks_per_node=$RANKS_PER_NODE"
echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS]"
echo "==================================="
for NRANKS in $RANK_POINTS; do
NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE ))
STEP_DIR=$RESDIR/${NRANKS}ranks
mkdir -p $STEP_DIR
echo ""
echo ">>> CG GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)"
srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \
--gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \
$EXE $IDIM $NREP $NWARM $ITMAX --gpu=TRUE --matrix=$MATRIX --fmt=MM \
> $STEP_DIR/run.out 2>&1
echo ">>> exit=$? output=$STEP_DIR/run.out"
# quick propagation sanity at submit time:
grep -q "INTERNAL-VECTOR SCHEME MISMATCH" $STEP_DIR/run.out \
&& echo " !! SCHEME PROPAGATION FAILED at $NRANKS ranks -- check run.out"
done
echo ""
echo "=== CG GPU DONE. Results: $RESDIR ==="
ls -R $RESDIR

@ -18,6 +18,8 @@ program psb_comm_cg_test
type(psb_dspmat_type) :: a
type(psb_desc_type) :: desc_a
type(psb_d_vect_type) :: b, x
type(psb_d_vect_type) :: probe ! mimics a hidden CG work vector for scheme-propagation check
integer(psb_ipk_) :: probe_got
type(psb_dprec_type) :: prec
#ifdef PSB_HAVE_CUDA
type(psb_d_vect_cuda) :: vmold
@ -30,7 +32,7 @@ program psb_comm_cg_test
integer(psb_ipk_) :: desc_me, desc_np
integer(psb_ipk_) :: idim, itmax, itrace, istop, iter
integer(psb_ipk_) :: scheme_idx, prec_idx, rep, nrep, nwarm
integer(psb_ipk_), parameter :: n_schemes=5, n_precs=2
integer(psb_ipk_), parameter :: n_schemes=5, n_precs=3
integer(psb_ipk_), allocatable :: iter_count(:,:,:), solve_info(:,:,:)
integer(psb_ipk_) :: scheme_type(n_schemes)
real(psb_dpk_) :: eps, err, t_start, t_elapsed
@ -81,8 +83,10 @@ program psb_comm_cg_test
prec_type(1) = 'NONE'
prec_type(2) = 'DIAG'
prec_type(3) = 'BJAC'
prec_name(1) = 'none'
prec_name(2) = 'diag'
prec_name(3) = 'bjac'
call get_command_argument(1,arg)
if (len_trim(arg) > 0) then
@ -167,7 +171,7 @@ program psb_comm_cg_test
end if
write(psb_out_unit,'("Number of processors : ",i0)') np
write(psb_out_unit,'("Iterative method : CG")')
write(psb_out_unit,'("Preconditioners : NONE, DIAG")')
write(psb_out_unit,'("Preconditioners : NONE, DIAG, BJAC")')
write(psb_out_unit,'("Max iterations (CG) : ",i0)') itmax
write(psb_out_unit,'("Repetitions : ",i0)') nrep
write(psb_out_unit,'("Warmup solves : ",i0)') nwarm
@ -280,6 +284,36 @@ program psb_comm_cg_test
end if
end if
! ---- DECISIVE CHECK: scheme propagation to a HIDDEN-style work vector ----
! The x check above can be silently skipped: inside CG, x is only updated
! with axpby (never haloed), so x%v%comm_handle may stay unallocated. The
! vectors that ARE haloed (r,p,q,z) are allocated and freed *inside* psb_dcg,
! so the test cannot inspect them directly. Here we replicate exactly what
! psb_dcg does to those work vectors -- psb_geall against desc_a + geasb with
! mold=x%v (so it is GPU-resident too when use_gpu) -- then force one halo.
! Its comm_handle must lazy-init from desc_a%comm_type, proving that any
! internally-allocated vector inherits the chosen scheme.
call psb_geall(probe, desc_a, info)
if (info == psb_success_) call psb_geasb(probe, desc_a, info, mold=x%v)
if (info == psb_success_) call psb_geaxpby(done, b, dzero, probe, desc_a, info)
if (info == psb_success_) call psb_halo(probe, desc_a, info) ! triggers dispatch
if (info /= psb_success_) goto 9999
probe_got = -1
if (allocated(probe%v%comm_handle)) probe_got = probe%v%comm_handle%comm_type
if (probe_got /= scheme_type(scheme_idx)) then
if (my_rank == psb_root_) &
write(psb_err_unit,'("INTERNAL-VECTOR SCHEME MISMATCH rank=",i0," expected=",i0," got=",i0)') &
my_rank, scheme_type(scheme_idx), probe_got
info = psb_err_internal_error_
goto 9999
else if (my_rank == psb_root_ .and. rep == 1) then
write(psb_out_unit,'(" [OK] internal-style work vector inherited scheme: ",a)') &
trim(scheme_name(scheme_idx))
end if
call psb_gefree(probe, desc_a, info)
if (info /= psb_success_) goto 9999
! ------------------------------------------------------------------------
call psb_geaxpby(dzero,b,dzero,x,desc_a,info)
if (info /= psb_success_) goto 9999

@ -15,7 +15,7 @@ TOBJS=psb_spmv_test.o
EXEDIR=./runs
all: runsd spmv_overlap
all: runsd psb_spmv_kernel
runsd:
(if test ! -d runs ; then mkdir runs; fi)
@ -23,10 +23,9 @@ runsd:
psb_spmv_test.o: psb_spmv_test.f90
$(FC) $(FCOPT) -cpp $(FINCLUDES) $(FDEFINES) $(FCUDEFINES) -c $< -o $@
spmv_overlap: $(TOBJS)
$(FLINK) $(LOPT) $(TOBJS) -o spmv_overlap $(PSBLAS_LIB) $(LDLIBS)
/bin/mv spmv_overlap $(EXEDIR)
/bin/cp -f $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel
psb_spmv_kernel: $(TOBJS)
$(FLINK) $(LOPT) $(TOBJS) -o psb_spmv_kernel $(PSBLAS_LIB) $(LDLIBS)
/bin/mv psb_spmv_kernel $(EXEDIR)
clean:
/bin/rm -f $(TOBJS) $(TOBJS_API) *$(.mod) $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel

@ -0,0 +1,206 @@
#!/bin/bash
# ============================================================================
# extract_scorep.sh -- auto-extract comm-scheme measurements from a
# results_*_<jobid> directory (swapdata/spmv style).
#
# For every <N>ranks/ subdir it collects:
# 1) run.out wall-clock avg time per backend (the REAL headline number)
# 2) cube_stat -r per swap routine from scorep_profile/profile.cubex:
# - INCL (total time in the routine, summed over call paths)
# - one-time setup (Win_create/Win_allocate/topology_init/
# ini_memory_buffer_layout/alltoallv_init)
# - steady = INCL - one-time
# - every MPI_* component (to scorep_components.csv)
#
# Output (written into RESULTS_DIR):
# runout.csv ranks,backend,total_s,avg_s
# scorep_summary.csv ranks,scheme,incl_s,onetime_s,steady_s
# scorep_components.csv ranks,scheme,component,time_s
# plus pretty pivot tables on stdout.
#
# Usage:
# ./extract_scorep.sh # process ALL <N>ranks/ in current dir
# ./extract_scorep.sh 80ranks # process ONE rank-point dir
# ./extract_scorep.sh path/to/results # process ALL <N>ranks/ under that dir
#
# CSVs ACCUMULATE (header written only if absent), so you can run one
# rank-point at a time and the pivots grow. To start over: rm *.csv
# ============================================================================
set -u
ARG="${1:-.}"
OUTDIR="${OUTDIR:-.}" # where the CSVs go (default: current dir)
command -v cube_stat >/dev/null 2>&1 || { echo "ERROR: cube_stat not in PATH"; exit 1; }
# scheme label : mangled routine name used by cube_stat -r
SCHEMES=(
"baseline:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_baseline_vect"
"neighbor:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_topology_vect"
"persistent:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_persistent_topology_vect"
"rma_pull:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_pull_vect"
"rma_push:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_push_vect"
)
RUNOUT="$OUTDIR/runout.csv"
SUMMARY="$OUTDIR/scorep_summary.csv"
COMPONENTS="$OUTDIR/scorep_components.csv"
# write header only if the file does not exist yet (append/accumulate mode)
[ -f "$RUNOUT" ] || echo "ranks,backend,total_s,avg_s,setup_s" > "$RUNOUT"
[ -f "$SUMMARY" ] || echo "ranks,scheme,incl_s,setup_s,gpuwait_s,pack_s,mpimove_s,mpiwait_s,netmpi_s" > "$SUMMARY"
[ -f "$COMPONENTS" ] || echo "ranks,scheme,bucket,component,time_s" > "$COMPONENTS"
# Select dirs: a single rank-point dir (has scorep_profile/run.out) or, if ARG
# is a results dir, all its <N>ranks/ subdirs.
if [ -f "$ARG/scorep_profile/profile.cubex" ] || [ -f "$ARG/run.out" ]; then
DIRS=( "$ARG" )
else
DIRS=( $(ls -d "$ARG"/*ranks 2>/dev/null | sort -t/ -k99 -V) )
fi
[ ${#DIRS[@]} -eq 0 ] && { echo "No rank-point dir(s) found under '$ARG'"; exit 1; }
for d in "${DIRS[@]}"; do
ranks=$(basename "$d" | sed 's/[^0-9]//g')
[ -z "$ranks" ] && continue
# ---- 1) run.out wall-clock avg + setup per backend ----
# One run.out holds several "comm backend" blocks in sequence; we accumulate
# tot/avg/setup for the current block and flush it when the next block starts
# (or at EOF). "setup time" is optional: 0 if the benchmark does not emit it
# (i.e. until the warm-up-timing patch lands), so the break-even falls back to
# pure steady-state comparison.
if [ -f "$d/run.out" ]; then
awk -v r="$ranks" '
function flush(){ if (be!=""){ print r","be","tot","avg","setup } be="";tot=0;avg=0;setup=0 }
/comm backend/ { flush(); split($0,a,":"); be=a[2]; gsub(/ /,"",be) }
/total time/ { split($0,a,":"); tot=a[2]+0 }
/avg time/ { split($0,a,":"); avg=a[2]+0 }
/setup time/ { split($0,a,":"); setup=a[2]+0 }
END { flush() }
' "$d/run.out" >> "$RUNOUT"
fi
# ---- 2) scorep cube_stat per scheme ----
cubex="$d/scorep_profile/profile.cubex"
[ -f "$cubex" ] || { echo "WARN: no profile.cubex in $d" >&2; continue; }
for entry in "${SCHEMES[@]}"; do
label="${entry%%:*}"; routine="${entry#*:}"
out=$(cube_stat -r "$routine" "$cubex" 2>/dev/null)
[ -z "$out" ] && continue
printf '%s\n' "$out" | awk -v ranks="$ranks" -v scheme="$label" -v comp="$COMPONENTS" '
# Bucketize every child component of the swap routine. cube_stat -r reports,
# for each child, its inclusive time within the routine subtree, and the
# children partition INCL -- so per-bucket sums are meaningful.
#
# Why this matters on GPU: the single largest component in EVERY scheme is
# d_cuda_device_wait (~0.036s, the GPU stream sync), which is NOT network
# time and is ~equal across schemes. The old "steady = INCL - onetime" left
# it inside, masking the real differentiator -- the MPI wait/sync time.
# Buckets (each component lands in exactly one, order matters):
# setup : one-time Win_create / topology_init / alltoallv_init /
# ini_memory_buffer_layout
# gpuwait : GPU stream sync (device_wait, cuda_sync) -- exclude from MPI
# mpiwait : blocking MPI sync (MPI_Wait, Win_wait/post/start/complete)
# <-- the discriminating metric ("guarda le wait")
# mpimove : actual transfer (Isend/Irecv/Put/Get/Start/[I]neighbor_alltoallv)
# pack : gather/scatter/buffer (gthz*, sctb*, new_buffer, realloc)
# netmpi = mpimove + mpiwait (honest per-iteration network cost)
{
k=split($0,a,","); v=a[k]+0 # value = last comma-field
nm=$0; sub(/,[^,]*$/,"",nm) # name = everything before it
if (nm ~ /^INCL\(/) { incl+=v; next }
if (nm ~ /^EXCL\(/) { next }
bkt="other"
if (nm ~ /Win_create|Win_allocate|topology_init|ini_memory_buffer_layout|alltoallv_init/) { setup+=v; bkt="setup" }
else if (nm ~ /device_wait|cuda_sync/) { gpuwait+=v; bkt="gpuwait" }
else if (nm ~ /MPI_Wait|Win_wait|Win_post|Win_start|Win_complete/) { mpiwait+=v; bkt="mpiwait" }
else if (nm ~ /MPI_Isend|MPI_Irecv|MPI_Send|MPI_Recv|MPI_Put|MPI_Get|MPI_Start|neighbor_alltoallv|Neighbor_alltoallv/) { mpimove+=v; bkt="mpimove" }
else if (nm ~ /gthz|sctb|new_buffer|realloc/) { pack+=v; bkt="pack" }
cn=nm; sub(/.*::/,"",cn); gsub(/"/,"",cn)
print ranks","scheme","bkt","cn","v >> comp
}
END { printf "%s,%s,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f\n", \
ranks, scheme, incl, setup, gpuwait, pack, mpimove, mpiwait, mpimove+mpiwait }
' >> "$SUMMARY"
done
done
# ---- pretty pivots ----
pivot() { # $1=csv $2=value-col-index(1-based, after ranks,key) $3=title
awk -F, -v vc="$2" -v title="$3" '
NR==1 { next }
{ r=$1; k=$2; val=$(2+vc); v[r","k]=val; rs[r]=1; ks[k]=1 }
END {
n=0; for (k in ks) cols[++n]=k
# stable-ish column order
order="baseline neighbor persistent rma_pull rma_push P2P NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT"
m=split(order,oc," "); delete cols; n=0
for (i=1;i<=m;i++) if (oc[i] in ks) cols[++n]=oc[i]
printf "\n=== %s ===\n", title
printf "%-8s", "ranks"; for (i=1;i<=n;i++) printf "%14s", cols[i]; printf "\n"
nr=0; for (r in rs) rr[++nr]=r
# numeric sort of ranks
for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t}
for (i=1;i<=nr;i++){ printf "%-8s", rr[i]; for (c=1;c<=n;c++){key=rr[i]","cols[c]; printf "%14s", (key in v)?v[key]:"-"} printf "\n" }
}' "$1"
}
# ---- break-even pivot: iterations after which scheme X beats P2P ----
# Model for an iterative solver doing n_it spmv (halo exchanges):
# T_X(n) = setup_X + n * steady_X (steady = run.out avg, setup excluded
# by the warm-up; setup = run.out "setup time")
# X beats the P2P reference when T_X(n) < T_P2P(n). Solving for the crossover:
# n* = (setup_X - setup_P2P) / (steady_P2P - steady_X)
# Output legend (per cell):
# <number> : break-even n* (X wins for n_it > n*) -- more setup, less steady
# always : X wins from iteration 1 (cheaper setup AND cheaper steady) -> dominates P2P
# never : P2P wins for all n_it (more setup AND slower steady) -> X is dominated
# <n* : X wins ONLY for n_it < n* (cheaper setup but slower steady)
# ^ this is the GPU/few-ranks RMA case: if RMA steady drops below P2P
# you'll instead see a number or "always" here.
breakeven() {
awk -F, '
NR==1 { next }
{ r=$1; be=$2; steady[r","be]=$4+0; setp[r","be]=$5+0; rs[r]=1; bes[be]=1 }
END {
order="NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT"; m=split(order,oc," ")
printf "\n=== break-even n* vs P2P (iterations to amortize setup; needs run.out '\''setup time'\'') ===\n"
printf "%-8s", "ranks"; for (i=1;i<=m;i++) if (oc[i] in bes) printf "%14s", oc[i]; printf "\n"
nr=0; for (r in rs) rr[++nr]=r
for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t}
anysetup=0
for (i=1;i<=nr;i++){
r=rr[i]; printf "%-8s", r
sref=steady[r",P2P"]; pref=setp[r",P2P"]
for (c=1;c<=m;c++){ be=oc[c]; if (!(be in bes)) continue
key=r","be
if (!(key in steady) || !(r",P2P" in steady)) { printf "%14s","-"; continue }
dsteady = sref - steady[key] # >0 : X faster per-iter than P2P
dsetup = setp[key] - pref # >0 : X pays more setup than P2P
if (setp[key]>0 || pref>0) anysetup=1
if (dsteady > 0) printf "%14s", (dsetup<=0 ? "always" : sprintf("%.0f", dsetup/dsteady))
else if (dsteady < 0) printf "%14s", (dsetup>=0 ? "never" : sprintf("<%.0f", dsetup/dsteady))
else printf "%14s", (dsetup<0 ? "always" : "never")
}
printf "\n"
}
if (!anysetup) print "\n NOTE: all setup_s == 0 -> run.out has no \"setup time\" yet (apply the\n warm-up-timing patch to the benchmark). Columns reflect steady-state only."
}' "$RUNOUT"
}
echo "Wrote: $RUNOUT $SUMMARY $COMPONENTS"
pivot "$RUNOUT" 2 "run.out AVG time per backend [s] (real wall-clock = steady-state per iter)"
pivot "$RUNOUT" 3 "run.out SETUP time per backend [s] (one-time, paid in warm-up)"
breakeven
pivot "$SUMMARY" 6 "scorep MPI WAIT per scheme [s] <== THE discriminator (sync/imbalance; 'guarda le wait')"
pivot "$SUMMARY" 7 "scorep NET MPI per scheme [s] (mpimove + mpiwait; excl GPU sync, pack, one-time setup)"
pivot "$SUMMARY" 5 "scorep MPI MOVE per scheme [s] (pure transfer: Isend/Irecv/Put/Get/a2av)"
pivot "$SUMMARY" 2 "scorep SETUP per scheme [s] (one-time: Win_create/topology_init/alltoallv_init/buf_layout)"
pivot "$SUMMARY" 3 "scorep GPU WAIT per scheme [s] (d_cuda_device_wait; sanity: ~equal across schemes)"
pivot "$SUMMARY" 1 "scorep INCL per scheme [s] (everything; dominated by GPU wait -> misleading alone)"
echo
echo "Tip: full per-component breakdown (now bucketed) is in $COMPONENTS"
echo " (ranks,scheme,bucket,component,time). e.g.:"
echo " grep ',rma_pull,' $COMPONENTS | sort -t, -k5 -gr # rma_pull hot spots"
echo " awk -F, '\$3==\"mpiwait\"' $COMPONENTS # only the waits"

@ -0,0 +1,113 @@
#!/bin/bash
# ============================================================================
# gpu_spmv.sh GPU SpMV across comm backends -- "sparse ranks" study
#
# Goal: few MPI ranks spread over MANY nodes (1 rank = 1 GPU), so that the
# halo exchange is dominated by INTER-node traffic. This is where RMA
# (MPI_Get/MPI_Put) may finally pay off: with few, fat messages over the
# network the one-time window-creation cost can amortize (break-even n*),
# unlike the dense CPU runs where RMA was dominated.
#
# The spmv test sweeps ALL backends internally (P2P, NEIGHBOR, PNEIGHBOR,
# MPI_GET, MPI_PUT) in one run -> no per-mode loop here.
# ============================================================================
#SBATCH --job-name=spmv_gpu
#SBATCH --output=spmv_gpu_%j.out
#SBATCH --error=spmv_gpu_%j.err
#SBATCH --nodes=8 # reserve up front; we use a subset per point
#SBATCH --ntasks-per-node=4 # MN5-ACC has 4x H100 per node
#SBATCH --gres=gpu:4 # request all 4 GPUs on each reserved node
#SBATCH --cpus-per-task=20 # 80 cores / 4 ranks = 20 cores per rank
#SBATCH --time=00:30:00
#SBATCH --qos=acc_debug
#SBATCH --account=ehpc580
# ============================================================================
# USER CONFIGURATION
# ============================================================================
TIMES=50 # SpMV repetitions (timed)
GPU_FMT=HLG # GPU matrix format (HLG|ELG|CSRG|HDIAG)
PROFILE=true # Score-P profiling on/off
# --- "sparse" knob: how many ranks (= GPUs) to put on each node.
# RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm
# (this is the configuration that isolates RMA-over-network)
# RANKS_PER_NODE=4 -> pack all 4 GPUs/node (denser, more intra-node)
RANKS_PER_NODE=1
# total ranks per scale point. With RANKS_PER_NODE=1 these are also node counts,
# so "2 4 8" means 2, 4, 8 nodes each holding a single GPU.
RANK_POINTS="2 4 8"
EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel
MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx
# ============================================================================
# ENVIRONMENT
# ============================================================================
module purge
module load bsc/1.0
module load nvidia-hpc-sdk/25.3
module load gcc/12.3.0
module load ucx/1.16.0-gcc
module load openmpi/5.0.5-gcc
module load openblas/0.3.27-gcc
export PATH="$HOME/scorep-cuda/bin:$PATH"
export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
export OMPI_MCA_coll_hcoll_enable=0
export SCOREP_CUDA_ENABLE=yes # GPU run: turn the CUDA adapter ON
export SCOREP_CUDA_BUFFER=48M # per-stream CUDA record buffer
RESDIR=$SLURM_SUBMIT_DIR/results_spmv_gpu_${SLURM_JOB_ID}
mkdir -p $RESDIR
# --- Score-P region filter (same as CPU: drop tiny high-frequency USR funcs) ---
FILTER=$RESDIR/scorep.filt
cat > $FILTER <<'EOF'
SCOREP_REGION_NAMES_BEGIN
EXCLUDE
psb_indx_map_mod::*
psb_desc_mod::*
psb_error_mod::*
psb_gen_block_map_mod::*
psi_penv_mod::*
psb_hash_mod::*
SCOREP_REGION_NAMES_END
EOF
echo "=== SPMV GPU (sparse-ranks study) ==="
echo " times=$TIMES gpu_fmt=$GPU_FMT ranks_per_node=$RANKS_PER_NODE"
echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE"
echo "====================================="
for NRANKS in $RANK_POINTS; do
NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE ))
STEP_DIR=$RESDIR/${NRANKS}ranks
mkdir -p $STEP_DIR
if [ "$PROFILE" = "true" ]; then
export SCOREP_ENABLE_PROFILING=true
export SCOREP_ENABLE_TRACING=false
export SCOREP_TOTAL_MEMORY=256M # bumped: CUDA adapter needs more
export SCOREP_FILTERING_FILE=$FILTER
export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile
else
export SCOREP_ENABLE_PROFILING=false
export SCOREP_ENABLE_TRACING=false
fi
echo ""
echo ">>> GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)"
srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \
--gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \
$EXE --gpu=TRUE --gpu_fmt=$GPU_FMT --matrix=$MATRIX --fmt=MM --times=$TIMES \
> $STEP_DIR/run.out 2>&1
echo ">>> exit=$? output=$STEP_DIR/run.out"
done
echo ""
echo "=== SPMV GPU DONE. Results: $RESDIR ==="
ls -R $RESDIR

@ -525,7 +525,7 @@ contains
return
end subroutine psb_d_gen_pde3d
subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,do_swap,comm_mode)
subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,comm_mode)
use psb_base_mod
#ifdef PSB_HAVE_CUDA
use psb_cuda_mod
@ -539,7 +539,6 @@ contains
character(len=*), intent(in) :: cpu_fmt
character(len=*), intent(in) :: gpu_fmt
integer(psb_ipk_), intent(in) :: idim_in, times_in
logical, intent(in) :: do_swap
character(len=*), intent(in) :: comm_mode
type(psb_dspmat_type) :: a
@ -628,9 +627,6 @@ contains
end if
if (info /= psb_success_) goto 9999
call psb_comm_set(comm_type,x%v%comm_handle,info)
if (info /= psb_success_) goto 9999
#ifdef PSB_HAVE_CUDA
if (use_gpu) then
select case(psb_toupper(trim(gpu_fmt)))
@ -654,14 +650,22 @@ contains
end if
#endif
! Set the comm scheme on the DESCRIPTOR, not the vector. desc_a%comm_type
! survives the GPU x%cnv above (which allocates a fresh x%v WITHOUT copying
! comm_handle); x then lazy-inits its handle from desc_a%comm_type at the
! first halo. Setting psb_comm_set on x%v%comm_handle *before* cnv silently
! reverted every backend to baseline on GPU (CPU was unaffected: no cnv).
call desc_a%set_comm_scheme(comm_type, info)
if (info /= psb_success_) goto 9999
! warm-up
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap)
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.true.)
if (info /= psb_success_) goto 9999
call psb_barrier(ctxt)
t0 = psb_wtime()
do i = 1, times
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap)
call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.true.)
if (info /= psb_success_) exit
end do
t1 = psb_wtime()
@ -672,11 +676,7 @@ contains
avg_t = dt / real(times, psb_dpk_)
if (my_rank == psb_root_) then
if (do_swap) then
write(psb_out_unit,'(/,"SpMV benchmark (overlap)")')
else
write(psb_out_unit,'(/,"SpMV benchmark (no overlap)")')
end if
write(psb_out_unit,'(/,"SpMV benchmark")')
write(psb_out_unit,'(" cpu matrix fmt : ",a)') trim(afmt)
if (use_gpu) write(psb_out_unit,'(" gpu matrix fmt : ",a)') trim(psb_toupper(trim(gpu_fmt)))
if (use_external_matrix) then
@ -784,7 +784,6 @@ program psb_spmv_kernel
character(len=8) :: cpu_fmt
character(len=8) :: gpu_fmt
integer(psb_ipk_) :: idim_arg, times_arg
logical :: do_overlap
integer :: kmode
integer, parameter :: n_comm_modes = 5
character(len=20), parameter :: comm_modes(n_comm_modes) = [character(len=20) :: &
@ -797,7 +796,6 @@ program psb_spmv_kernel
matrix_fmt = 'MM'
cpu_fmt = 'CSR'
gpu_fmt = 'HLG'
do_overlap = .true.
call psb_init(ctxt)
call psb_info(ctxt, my_rank, np)
@ -842,10 +840,6 @@ program psb_spmv_kernel
gpu_fmt = psb_toupper(adjustl(arg(14:len_trim(arg))))
else if (index(psb_toupper(trim(arg)), '--GPU_FMT=') == 1) then
gpu_fmt = psb_toupper(adjustl(arg(11:len_trim(arg))))
else if ((trim(psb_toupper(arg)) == '--NOOVERLAP') .or. (trim(psb_toupper(arg)) == '--NO_OVERLAP')) then
do_overlap = .false.
else if ((trim(psb_toupper(arg)) == '--OVERLAP') .or. (trim(psb_toupper(arg)) == '--SWAP')) then
do_overlap = .true.
else if (trim(psb_toupper(arg)) == '--MATRIX') then
if (k < command_argument_count()) call get_command_argument(k+1,matrix_file)
else if (trim(psb_toupper(arg)) == '--FMT') then
@ -897,7 +891,7 @@ program psb_spmv_kernel
write(psb_out_unit,'("GPU enabled : ",l1)') use_gpu
write(psb_out_unit,'("Usage: ./psb_spmv_kernel [--gpu=TRUE|FALSE] [--dim=N] [--times=N] ",&
&"[--cpu_fmt=CSR|COO|CSC|ELL|HLL] [--gpu_fmt=HLL|ELL|CSR|HDIA] [--matrix=<path>] [--fmt=MM|HB] ",&
&"[--overlap|--nooverlap] (runs all comm backends)")')
&"(runs all comm backends)")')
end if
do kmode = 1, n_comm_modes
@ -905,7 +899,7 @@ program psb_spmv_kernel
write(psb_out_unit,'(/,"=== Backend sweep: ",a," ===")') trim(comm_modes(kmode))
end if
call run_spmv_kernel(ctxt, use_gpu, matrix_file, matrix_fmt, cpu_fmt, gpu_fmt, &
& idim_arg, times_arg, do_overlap, comm_modes(kmode))
& idim_arg, times_arg, comm_modes(kmode))
end do
#ifdef PSB_HAVE_CUDA

@ -0,0 +1,96 @@
#!/bin/bash
# ============================================================================
# strong_spmv.sbatch STRONG scaling, CPU-only SpMV across comm backends
#
# Fixed total problem size; MPI ranks grow across scale points. The spmv test
# sweeps ALL comm backends internally (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET,
# MPI_PUT) in a single run, so there is no per-mode loop.
# All scale points run inside ONE reserved allocation (same hardware).
# ============================================================================
#SBATCH --job-name=spmv_strong
#SBATCH --output=spmv_strong_%j.out
#SBATCH --error=spmv_strong_%j.err
#SBATCH --nodes=8 # reserve the maximum up front
#SBATCH --ntasks-per-node=80 # 80 cores per ACC node -> 80 ranks/node
#SBATCH --cpus-per-task=1 # CPU-only: one core per rank
#SBATCH --time=00:30:00
#SBATCH --qos=acc_debug
#SBATCH --account=ehpc580
# ============================================================================
# USER CONFIGURATION
# ============================================================================
DIM=280 # FIXED problem size (idim^3 unknowns)
TIMES=50 # SpMV repetitions (timed)
CPU_FMT=CSR # CPU matrix storage format
PROFILE=true # Score-P profiling on/off
RANK_POINTS="80 160 320 640" # total ranks per scale point (multiples of 80)
EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel
# ============================================================================
# ENVIRONMENT
# ============================================================================
module purge
module load bsc/1.0
module load nvidia-hpc-sdk/25.3
module load gcc/12.3.0
module load ucx/1.16.0-gcc
module load openmpi/5.0.5-gcc
module load openblas/0.3.27-gcc
export PATH="$HOME/scorep-cuda/bin:$PATH"
export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
export OMPI_MCA_coll_hcoll_enable=0
export SCOREP_CUDA_ENABLE=no # CPU-only run: silence Score-P CUDA adapter
RESDIR=$SLURM_SUBMIT_DIR/results_spmv_strong_${SLURM_JOB_ID}
mkdir -p $RESDIR
# --- Score-P region filter: exclude the high-frequency tiny USR functions so
# the profile reflects real MPI/comm time, not instrumentation overhead. ---
FILTER=$RESDIR/scorep.filt
cat > $FILTER <<'EOF'
SCOREP_REGION_NAMES_BEGIN
EXCLUDE
psb_indx_map_mod::*
psb_desc_mod::*
psb_error_mod::*
psb_gen_block_map_mod::*
psi_penv_mod::*
psb_hash_mod::*
SCOREP_REGION_NAMES_END
EOF
echo "=== SPMV STRONG scaling (CPU-only) ==="
echo " fixed_dim=$DIM times=$TIMES cpu_fmt=$CPU_FMT"
echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE"
echo "======================================"
for NRANKS in $RANK_POINTS; do
NNODES=$(( (NRANKS + 79) / 80 )) # 80 ranks per node
STEP_DIR=$RESDIR/${NRANKS}ranks
mkdir -p $STEP_DIR
if [ "$PROFILE" = "true" ]; then
export SCOREP_ENABLE_PROFILING=true
export SCOREP_ENABLE_TRACING=false
export SCOREP_TOTAL_MEMORY=128M
export SCOREP_FILTERING_FILE=$FILTER
export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile
else
export SCOREP_ENABLE_PROFILING=false
export SCOREP_ENABLE_TRACING=false
fi
echo ""
echo ">>> STRONG point: $NRANKS ranks ($NNODES nodes), fixed dim=$DIM"
srun -N $NNODES -n $NRANKS --ntasks-per-node=80 --cpus-per-task=1 \
$EXE --gpu=FALSE --dim=$DIM --times=$TIMES --cpu_fmt=$CPU_FMT \
> $STEP_DIR/run.out 2>&1
echo ">>> exit=$? output=$STEP_DIR/run.out"
done
echo ""
echo "=== SPMV STRONG DONE. Results: $RESDIR ==="
ls -R $RESDIR

@ -26,6 +26,7 @@ file(MAKE_DIRECTORY ${EXEDIR})
set(SOURCES_D_NEST_GLOB_TEST psb_d_nest_glob_test.F90)
set(SOURCES_D_NEST_RECT_TEST psb_d_nest_rect_test.F90)
set(SOURCES_D_NEST_CG_TEST psb_d_nest_cg_test.F90)
set(SOURCES_D_NEST_HALO_TEST psb_d_nest_halo_regime_test.F90)
add_executable(psb_d_nest_glob_test ${SOURCES_D_NEST_GLOB_TEST})
target_link_libraries(psb_d_nest_glob_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base)
@ -36,8 +37,11 @@ target_link_libraries(psb_d_nest_rect_test psblas::util psblas::linsolve psblas:
add_executable(psb_d_nest_cg_test ${SOURCES_D_NEST_CG_TEST})
target_link_libraries(psb_d_nest_cg_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base)
add_executable(psb_d_nest_halo_regime_test ${SOURCES_D_NEST_HALO_TEST})
target_link_libraries(psb_d_nest_halo_regime_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base)
# Set output directory for executables
foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test)
foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_halo_regime_test)
set_target_properties(${target} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${EXEDIR}
)

@ -16,11 +16,15 @@ FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG).
EXEDIR=./runs
all: runsd psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test
all: runsd psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_halo_regime_test
runsd:
(if test ! -d runs ; then mkdir runs; fi)
psb_d_nest_halo_regime_test: psb_d_nest_halo_regime_test.o
$(FLINK) psb_d_nest_halo_regime_test.o -o psb_d_nest_halo_regime_test $(PSBLAS_LIB) $(LDLIBS)
/bin/mv psb_d_nest_halo_regime_test $(EXEDIR)
psb_d_nest_glob_test: psb_d_nest_glob_test.o
$(FLINK) psb_d_nest_glob_test.o -o psb_d_nest_glob_test $(PSBLAS_LIB) $(LDLIBS)
/bin/mv psb_d_nest_glob_test $(EXEDIR)
@ -34,8 +38,9 @@ psb_d_nest_cg_test: psb_d_nest_cg_test.o
/bin/mv psb_d_nest_cg_test $(EXEDIR)
clean:
/bin/rm -f psb_d_nest_glob_test.o psb_d_nest_rect_test.o psb_d_nest_cg_test.o *$(.mod) \
$(EXEDIR)/psb_d_nest_glob_test $(EXEDIR)/psb_d_nest_rect_test $(EXEDIR)/psb_d_nest_cg_test
/bin/rm -f psb_d_nest_glob_test.o psb_d_nest_rect_test.o psb_d_nest_cg_test.o psb_d_nest_halo_regime_test.o *$(.mod) \
$(EXEDIR)/psb_d_nest_glob_test $(EXEDIR)/psb_d_nest_rect_test $(EXEDIR)/psb_d_nest_cg_test \
$(EXEDIR)/psb_d_nest_halo_regime_test
verycleanlib:
(cd ../..; make veryclean)
lib:

@ -0,0 +1,398 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! File: psb_d_nest_halo_regime_test.F90
! Author: Simone Staccone (Stack-1)
!
! Benchmarks the two halo-exchange regimes of a nested (MATNEST-style) operator
! on the SAME assembled blocks:
!
! * UNION (fused): one psb_halo over the composed global descriptor brings in
! the union of every field's ghosts at once (the production path used by
! psb_spmm / Krylov / AMG4PSBLAS).
!
! * PER-BLOCK (selective): each present block (i,j) exchanges ONLY its own
! off-process columns, through its restricted descriptor block_col_desc(i,j),
! one psb_halo per block.
!
! The two products must coincide to machine precision. The test reports, for
! each regime: the number of halo exchanges, the full-matvec time, and the
! PURE-COMMUNICATION time (only the psb_halo calls), so the cost of the message
! aggregation can be isolated.
!
! Saddle-point 2x2 operator [ A B^T ; B 0 ] with TWO fields of DIFFERENT
! size (n1 = field 1 / "velocity", n2 = field 2 / "pressure", n1 /= n2 allowed);
! the (2,2) block is absent. The blocks have REAL, distinct per-block halos (so
! per-block vs union is meaningful): A (1,1) tridiagonal on field 1; the
! rectangular B^T (1,2) and B (2,1) couple every row to a column half a field
! away, which lands on another process.
!
! Run: mpirun -np <P> ./psb_d_nest_halo_regime_test [n1] [n2] [n_reps] [stride]
! n1 : global rows of field 1 (default 2000000)
! n2 : global rows of field 2 (default 500000)
! n_reps : timed repetitions (default 50)
! stride : B/B^T coupling distance (default 1)
! small => local halo => LATENCY-bound (aggregation wins)
! ~n/2 => huge halo => BANDWIDTH-bound (union ~ per-block)
!
program psb_d_nest_halo_regime_test
use psb_base_mod
use psb_d_nest_mod
implicit none
type(psb_ctxt_type) :: context
integer(psb_ipk_) :: my_rank, num_procs, info, i_local_row
integer(psb_ipk_) :: entry_idx, field1_local_rows, field2_local_rows
integer(psb_ipk_) :: n_local_rows, n_local_cols, n_exch, n_warm
integer(psb_ipk_) :: n_reps, narg
integer(psb_lpk_) :: global_row, field1_size, field2_size, gcol, stride
character(len=64) :: arg
type(psb_d_nest_matrix) :: nested_matrix
integer(psb_lpk_), allocatable :: entry_rows(:), entry_cols(:)
integer(psb_lpk_), allocatable :: field1_rows(:), field2_rows(:)
real(psb_dpk_), allocatable :: entry_vals(:)
real(psb_dpk_), allocatable :: x_arr(:), work_x(:), y_union(:), y_perblock(:)
real(psb_dpk_) :: mismatch_norm, t_u, t_b
logical :: ok_u, ok_b
integer(psb_ipk_) :: s_idx, n_schemes
integer(psb_ipk_), allocatable :: schemes(:)
character(len=34), allocatable :: scheme_names(:)
type(psb_d_vect_type) :: xg_vect ! global-local halo vector
type(psb_d_vect_type), allocatable :: bvect(:,:) ! one per present block
real(psb_dpk_), parameter :: tolerance = 1.0e-10_psb_dpk_
call psb_init(context)
call psb_info(context, my_rank, num_procs)
! -------- runtime parameters (rank 0 parses, then broadcast) --------
field1_size = 2000000_psb_lpk_
field2_size = 500000_psb_lpk_
n_reps = 50
stride = 1_psb_lpk_ ! B/B^T coupling distance: small => local halo
! (latency-bound), large (~n/2) => big halo
! (bandwidth-bound)
if (my_rank == 0) then
narg = command_argument_count()
if (narg >= 1) then
call get_command_argument(1, arg); read(arg,*) field1_size
end if
if (narg >= 2) then
call get_command_argument(2, arg); read(arg,*) field2_size
end if
if (narg >= 3) then
call get_command_argument(3, arg); read(arg,*) n_reps
end if
if (narg >= 4) then
call get_command_argument(4, arg); read(arg,*) stride
end if
end if
call psb_bcast(context, field1_size)
call psb_bcast(context, field2_size)
call psb_bcast(context, n_reps)
call psb_bcast(context, stride)
n_warm = max(5, n_reps/10)
!---------------------------------------------------------------
! 1) build the 2x2 nested operator with real per-block halos
!---------------------------------------------------------------
call nested_matrix%init(context, [field1_size, field2_size], info)
if (info /= psb_success_) then
if (my_rank==0) write(*,*) 'FAIL: init info=', info; goto 9999
end if
field1_rows = nested_matrix%get_owned_rows(1)
field2_rows = nested_matrix%get_owned_rows(2)
field1_local_rows = size(field1_rows)
field2_local_rows = size(field2_rows)
! A = tridiag(-1,2,-1) -> block (1,1)
allocate(entry_rows(3*field1_local_rows), entry_cols(3*field1_local_rows), entry_vals(3*field1_local_rows))
entry_idx = 0
do i_local_row = 1, field1_local_rows
global_row = field1_rows(i_local_row)
entry_idx = entry_idx + 1
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row; entry_vals(entry_idx) = 2.0_psb_dpk_
if (global_row > 1) then
entry_idx = entry_idx + 1
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row - 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_
end if
if (global_row < field1_size) then
entry_idx = entry_idx + 1
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row + 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_
end if
end do
call nested_matrix%ins(1, 1, entry_idx, entry_rows, entry_cols, entry_vals, info)
deallocate(entry_rows, entry_cols, entry_vals)
! B^T -> block (1,2): rows in field 1, columns in field 2 (RECTANGULAR n1 x n2).
! Each row r couples to field-2 columns (r mod n2) and (r + n2/2 mod n2); the
! second one lands on another process => real, distinct halo for this block.
allocate(entry_rows(2*field1_local_rows), entry_cols(2*field1_local_rows), entry_vals(2*field1_local_rows))
entry_idx = 0
do i_local_row = 1, field1_local_rows
global_row = field1_rows(i_local_row)
entry_idx = entry_idx + 1
gcol = mod(global_row - 1_psb_lpk_, field2_size) + 1_psb_lpk_
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.5_psb_dpk_
entry_idx = entry_idx + 1
gcol = mod(global_row - 1_psb_lpk_ + stride, field2_size) + 1_psb_lpk_
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.25_psb_dpk_
end do
call nested_matrix%ins(1, 2, entry_idx, entry_rows, entry_cols, entry_vals, info)
deallocate(entry_rows, entry_cols, entry_vals)
! B -> block (2,1): rows in field 2, columns in field 1 (RECTANGULAR n2 x n1).
allocate(entry_rows(2*field2_local_rows), entry_cols(2*field2_local_rows), entry_vals(2*field2_local_rows))
entry_idx = 0
do i_local_row = 1, field2_local_rows
global_row = field2_rows(i_local_row)
entry_idx = entry_idx + 1
gcol = mod(global_row - 1_psb_lpk_, field1_size) + 1_psb_lpk_
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.3_psb_dpk_
entry_idx = entry_idx + 1
gcol = mod(global_row - 1_psb_lpk_ + stride, field1_size) + 1_psb_lpk_
entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.15_psb_dpk_
end do
call nested_matrix%ins(2, 1, entry_idx, entry_rows, entry_cols, entry_vals, info)
deallocate(entry_rows, entry_cols, entry_vals)
call nested_matrix%asb(info)
if (info /= psb_success_) then
if (my_rank==0) write(*,*) 'FAIL: asb info=', info; goto 9999
end if
!---------------------------------------------------------------
! 2) global-local work vectors (x[g] = g on the owned entries)
!---------------------------------------------------------------
n_local_rows = nested_matrix%desc_glob%get_local_rows()
n_local_cols = nested_matrix%desc_glob%get_local_cols()
allocate(x_arr(n_local_cols), work_x(n_local_cols), &
& y_union(n_local_cols), y_perblock(n_local_cols))
x_arr = dzero
do i_local_row = 1, n_local_rows
call nested_matrix%desc_glob%l2g(i_local_row, global_row, info)
x_arr(i_local_row) = real(global_row, psb_dpk_)
end do
!---------------------------------------------------------------
! 3) correctness: union vs per-block on the same x
!---------------------------------------------------------------
work_x = x_arr
call psb_spmm(done, nested_matrix%a_glob, work_x, dzero, y_union, nested_matrix%desc_glob, info)
if (info /= psb_success_) then
if (my_rank==0) write(*,*) 'FAIL: psb_spmm (union) info=', info; goto 9999
end if
call nest_spmv_perblock(nested_matrix, done, x_arr, dzero, y_perblock, info, n_exch, .false.)
if (info /= psb_success_) then
if (my_rank==0) write(*,*) 'FAIL: per-block driver info=', info; goto 9999
end if
mismatch_norm = dzero
do i_local_row = 1, n_local_rows
mismatch_norm = max(mismatch_norm, abs(y_union(i_local_row) - y_perblock(i_local_row)))
end do
call psb_amx(context, mismatch_norm)
!---------------------------------------------------------------
! 4) per-scheme PURE COMMUNICATION sweep.
! The comm scheme is selected per descriptor (desc%set_comm_scheme) and is
! honoured ONLY by the encapsulated vect path; the array path used in (3)
! is always baseline. For each scheme we time, on persistent vects:
! union = one psb_halo over desc_glob
! per-block = one psb_halo per present block_col_desc(i,j)
!---------------------------------------------------------------
n_schemes = 5
allocate(schemes(n_schemes), scheme_names(n_schemes))
schemes = [ psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_, &
& psb_comm_persistent_ineighbor_alltoallv_, &
& psb_comm_rma_pull_, psb_comm_rma_push_ ]
scheme_names = [ character(len=34) :: 'isend_irecv (baseline)', &
& 'ineighbor_alltoallv', 'persistent_ineighbor_alltoallv', &
& 'rma_pull', 'rma_push' ]
allocate(bvect(nested_matrix%n_fields, nested_matrix%n_fields))
if (my_rank == 0) then
write(*,'(a,i0,a,i0,a,i0,a,i0,a,i0)') ' np=', num_procs, ' n1=', field1_size, &
& ' n2=', field2_size, ' stride=', stride, ' reps=', n_reps
write(*,'(a,es12.4)') ' max|y_union - y_perblock| = ', mismatch_norm
if (mismatch_norm <= tolerance) then
write(*,*) ' PASS: regimes agree'
else
write(*,*) ' FAIL: regimes disagree'
end if
write(*,'(a,i0)') ' halo exchanges: union = 1 per-block = ', n_exch
write(*,*)
write(*,'(a)') ' pure halo communication time [s] (min over reps, slowest rank)'
write(*,'(a)') ' scheme union per-block ratio'
end if
do s_idx = 1, n_schemes
call set_scheme(schemes(s_idx))
call build_vects()
call time_comm('U', t_u, ok_u)
call time_comm('B', t_b, ok_b)
call free_vects()
if (my_rank == 0) then
if (ok_u .and. ok_b) then
write(*,'(1x,a34,es15.4,es15.4,3x,f7.2)') scheme_names(s_idx), t_u, t_b, &
& t_b/max(t_u, tiny(t_u))
else
write(*,'(1x,a34,a)') scheme_names(s_idx), ' (unavailable on this build/MPI)'
end if
end if
end do
deallocate(bvect, schemes, scheme_names)
call nested_matrix%free(info)
9999 continue
call psb_exit(context)
contains
! Set the communication scheme on the global descriptor and on every present
! per-block descriptor (the per-block exchanges must use the same scheme).
subroutine set_scheme(scheme)
integer(psb_ipk_), intent(in) :: scheme
integer(psb_ipk_) :: i, j, linfo
call nested_matrix%desc_glob%set_comm_scheme(scheme, linfo)
do j = 1, nested_matrix%n_fields
do i = 1, nested_matrix%n_fields
if (nested_matrix%block_storage%has_block(i,j)) &
& call nested_matrix%block_col_desc(i,j)%set_comm_scheme(scheme, linfo)
end do
end do
end subroutine set_scheme
! Fresh persistent halo vectors (the comm_handle, hence the scheme, is created
! from desc%comm_type on the first psb_halo and then reused across reps).
subroutine build_vects()
integer(psb_ipk_) :: i, j, linfo
call psb_geall(xg_vect, nested_matrix%desc_glob, linfo)
call psb_geasb(xg_vect, nested_matrix%desc_glob, linfo)
do j = 1, nested_matrix%n_fields
do i = 1, nested_matrix%n_fields
if (.not. nested_matrix%block_storage%has_block(i,j)) cycle
call psb_geall(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
call psb_geasb(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
end do
end do
end subroutine build_vects
subroutine free_vects()
integer(psb_ipk_) :: i, j, linfo
call xg_vect%free(linfo)
do j = 1, nested_matrix%n_fields
do i = 1, nested_matrix%n_fields
if (nested_matrix%block_storage%has_block(i,j)) call bvect(i,j)%free(linfo)
end do
end do
end subroutine free_vects
! Time the pure communication of one regime: n_warm warm-up runs, then min over
! n_reps of the slowest rank's wall time (psb_amx = max across ranks).
subroutine time_comm(code, t_min, ok)
character, intent(in) :: code ! 'U' union / 'B' per-block
real(psb_dpk_), intent(out) :: t_min
logical, intent(out) :: ok
integer(psb_ipk_) :: rep, linfo
real(psb_dpk_) :: t0, dt
ok = .true.
do rep = 1, n_warm
call do_comm(code, linfo); if (linfo /= 0) ok = .false.
end do
t_min = huge(t_min)
do rep = 1, n_reps
call psb_barrier(context)
t0 = psb_wtime()
call do_comm(code, linfo)
dt = psb_wtime() - t0
call psb_amx(context, dt)
t_min = min(t_min, dt)
if (linfo /= 0) ok = .false.
end do
if (.not. ok) t_min = dzero
end subroutine time_comm
subroutine do_comm(code, linfo)
character, intent(in) :: code
integer(psb_ipk_), intent(out) :: linfo
integer(psb_ipk_) :: i, j
linfo = 0
if (code == 'U') then
call psb_halo(xg_vect, nested_matrix%desc_glob, linfo)
else
do j = 1, nested_matrix%n_fields
do i = 1, nested_matrix%n_fields
if (.not. nested_matrix%block_storage%has_block(i,j)) cycle
call psb_halo(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
if (linfo /= 0) return
end do
end do
end if
end subroutine do_comm
! Per-block nested matvec, exchanging one block's halo at a time. x, y are in
! the composed global-local layout. halo_only=.true. performs ONLY the
! per-block psb_halo calls (for the pure-communication timing).
subroutine nest_spmv_perblock(op, alpha, x, beta, y, info, n_exchanges, halo_only)
type(psb_d_nest_matrix), intent(inout) :: op
real(psb_dpk_), intent(in) :: x(:)
real(psb_dpk_), intent(inout) :: y(:)
real(psb_dpk_), intent(in) :: alpha, beta
integer(psb_ipk_), intent(out) :: info, n_exchanges
logical, intent(in) :: halo_only
integer(psb_ipk_) :: nf, i, j, k, nrl, nownj, nowni, nfc, nbc
integer(psb_ipk_), allocatable :: owned_offset(:)
real(psb_dpk_), allocatable :: xs(:), xf(:), yb(:)
info = psb_success_; n_exchanges = 0
nf = op%n_fields
allocate(owned_offset(nf+1))
owned_offset(1) = 0
do j = 1, nf
owned_offset(j+1) = owned_offset(j) + op%field_desc(j)%get_local_rows()
end do
nrl = owned_offset(nf+1)
if (.not. halo_only) then
if (beta == dzero) then
y(1:nrl) = dzero
else if (beta /= done) then
y(1:nrl) = beta * y(1:nrl)
end if
end if
do j = 1, nf
nownj = op%field_desc(j)%get_local_rows()
nfc = op%field_desc(j)%get_local_cols()
do i = 1, nf
if (.not. op%block_storage%has_block(i,j)) cycle
nbc = op%block_col_desc(i,j)%get_local_cols()
allocate(xs(nbc)); xs = dzero
xs(1:nownj) = x(owned_offset(j)+1 : owned_offset(j)+nownj)
call psb_halo(xs, op%block_col_desc(i,j), info)
if (info /= psb_success_) return
n_exchanges = n_exchanges + 1
if (.not. halo_only) then
allocate(xf(nfc)); xf = dzero
do k = 1, nbc
xf(op%blk2field(i,j)%map(k)) = xs(k)
end do
nowni = op%field_desc(i)%get_local_rows()
allocate(yb(nowni)); yb = dzero
call op%block_storage%mats(i,j)%a%csmv(alpha, xf, dzero, yb, info)
if (info /= psb_success_) return
y(owned_offset(i)+1 : owned_offset(i)+nowni) = &
& y(owned_offset(i)+1 : owned_offset(i)+nowni) + yb
deallocate(xf, yb)
end if
deallocate(xs)
end do
end do
deallocate(owned_offset)
end subroutine nest_spmv_perblock
end program psb_d_nest_halo_regime_test
Loading…
Cancel
Save