From 9533c229895d8c15f89ecfd286b12dffe928e330 Mon Sep 17 00:00:00 2001 From: Stack-1 Date: Wed, 1 Jul 2026 12:05:54 +0200 Subject: [PATCH] [WIP] RMA swapdata optimization, nest builder updates, GPU/scorep test tooling - psi_dswapdata / psb_comm_rma: RMA swapdata optimization (PSCW + Win_allocate) - psb_d_nest_builder: builder updates - cuda: Makefile + psb_d_cuda_vect_mod adjustments - test/comm: spmv/cg drivers, Makefiles, GPU + scorep/strong-scaling scripts - test/nested: halo-regime test + Makefile/CMake updates --- base/comm/internals/psi_dswapdata.F90 | 439 ++++++++---------- .../comm/comm_schemes/psb_comm_rma_mod.F90 | 49 ++ base/modules/tools/psb_d_nest_builder_mod.F90 | 76 ++- cuda/Makefile | 2 +- cuda/psb_d_cuda_vect_mod.F90 | 40 ++ test/comm/cg/gpu_cg.sh | 89 ++++ test/comm/cg/psb_comm_cg_test.F90 | 38 +- test/comm/spmv/Makefile | 9 +- test/comm/spmv/extract_scorep.sh | 206 ++++++++ test/comm/spmv/gpu_spmv.sh | 113 +++++ test/comm/spmv/psb_spmv_test.f90 | 34 +- test/comm/spmv/strong_spmv.sh | 96 ++++ test/nested/CMakeLists.txt | 6 +- test/nested/Makefile | 11 +- test/nested/psb_d_nest_halo_regime_test.F90 | 398 ++++++++++++++++ 15 files changed, 1324 insertions(+), 282 deletions(-) create mode 100644 test/comm/cg/gpu_cg.sh create mode 100755 test/comm/spmv/extract_scorep.sh create mode 100644 test/comm/spmv/gpu_spmv.sh create mode 100644 test/comm/spmv/strong_spmv.sh create mode 100644 test/nested/psb_d_nest_halo_regime_test.F90 diff --git a/base/comm/internals/psi_dswapdata.F90 b/base/comm/internals/psi_dswapdata.F90 index 02db23d84..6a4e4f1d4 100644 --- a/base/comm/internals/psi_dswapdata.F90 +++ b/base/comm/internals/psi_dswapdata.F90 @@ -1009,6 +1009,8 @@ contains integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size integer(psb_ipk_), allocatable :: peer_mpi_rank(:) + integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx + integer(psb_mpk_), allocatable :: grp_ranks(:) logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed type(psb_comm_rma_handle), pointer :: rma_handle character(len=30) :: name @@ -1077,59 +1079,42 @@ contains end if end if + ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors + ! combuf is the pinned/registered buffer, so RMA uses pinned memory. if (buffer_size > 0) then if (.not. allocated(y%combuf)) then - ! Allocate buffer with size of comm_indexes%v to include all metadata, - ! matching baseline and topology buffer layout call y%new_buffer(ione*size(comm_indexes%v), info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if else if (size(y%combuf) < size(comm_indexes%v)) then - ! Need a larger exposed memory area: recreate the RMA window first, - ! then reallocate combuf and lazily create a new window below. if (rma_handle%window_open) then call mpi_win_unlock_all(rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if rma_handle%window_open = .false. end if if (rma_handle%window_ready) then call mpi_win_free(rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - rma_handle%window_ready = .false. rma_handle%win = mpi_win_null + rma_handle%window_ready = .false. end if - ! Allocate buffer with size of comm_indexes%v to include all metadata, - ! matching baseline and topology buffer layout call y%new_buffer(ione*size(comm_indexes%v), info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if end if - end if - if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then - ! Expose combuf once and keep the window around until the descriptor changes. - element_bytes = storage_size(y%combuf(1))/8 - exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) - call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & - & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 + if (.not. rma_handle%window_ready) then + element_bytes = storage_size(y%combuf(1))/8 + exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) + call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & + & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info,name,m_err=(/iret/)) + goto 9999 + end if + rma_handle%window_ready = .true. end if - rma_handle%window_ready = .true. end if if (buffer_size > 0) then @@ -1138,7 +1123,19 @@ contains end if call y%device_wait() - ! Pull data from each peer with per-neighbor passive lock (neighbor-only sync). + ! Open exposure (post) + access (start) epochs over the CACHED neighbor + ! group (built once with the layout in the handle) -- no per-call build. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + end if + do neighbor_idx=1, num_neighbors proc_to_comm = rma_handle%peer_proc(neighbor_idx) recv_count = rma_handle%peer_recv_counts(neighbor_idx) @@ -1161,12 +1158,6 @@ contains end if if (recv_count > 0) then - call mpi_win_lock(MPI_LOCK_SHARED, prc_rank, 0, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if remote_disp = int(remote_base - 1, kind=MPI_ADDRESS_KIND) call mpi_get(y%combuf(recv_pos), recv_count, psb_mpi_r_dpk_, prc_rank, remote_disp, recv_count, psb_mpi_r_dpk_, & & rma_handle%win, iret) @@ -1175,12 +1166,6 @@ contains call psb_errpush(info,name,m_err=(/iret/)) goto 9999 end if - call mpi_win_unlock(prc_rank, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if end if else if (send_count /= recv_count) then @@ -1191,6 +1176,19 @@ contains y%combuf(recv_pos:recv_pos+recv_count-1) = y%combuf(send_pos:send_pos+send_count-1) end if end do + + ! Close access epoch (my GETs done) then exposure epoch (peers done + ! reading my window): received data is now in y%combuf. Cached group, no free. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_complete(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_wait(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + end if end if end if @@ -1235,7 +1233,8 @@ contains integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size integer(psb_ipk_), allocatable :: peer_mpi_rank(:) - integer(psb_mpk_), parameter :: rma_push_notify_tag = 914_psb_mpk_ + integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx + integer(psb_mpk_), allocatable :: grp_ranks(:) logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed type(psb_comm_rma_handle), pointer :: rma_handle character(len=30) :: name @@ -1302,59 +1301,42 @@ contains end if end if + ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors + ! combuf is the pinned/registered buffer, so RMA uses pinned memory. if (buffer_size > 0) then if (.not. allocated(y%combuf)) then - ! Allocate buffer with size of comm_indexes%v to include all metadata, - ! matching baseline and topology buffer layout call y%new_buffer(ione*size(comm_indexes%v), info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if else if (size(y%combuf) < size(comm_indexes%v)) then - ! Need a larger exposed memory area: recreate the RMA window first, - ! then reallocate combuf and lazily create a new window below. if (rma_handle%window_open) then call mpi_win_unlock_all(rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if rma_handle%window_open = .false. end if if (rma_handle%window_ready) then call mpi_win_free(rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - rma_handle%window_ready = .false. rma_handle%win = mpi_win_null + rma_handle%window_ready = .false. end if - ! Allocate buffer with size of comm_indexes%v to include all metadata, - ! matching baseline and topology buffer layout call y%new_buffer(ione*size(comm_indexes%v), info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if end if - end if - if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then - ! Keep the window alive across repetitions: it is created once and reused. - element_bytes = storage_size(y%combuf(1))/8 - exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) - call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & - & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 + if (.not. rma_handle%window_ready) then + element_bytes = storage_size(y%combuf(1))/8 + exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) + call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & + & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info,name,m_err=(/iret/)) + goto 9999 + end if + rma_handle%window_ready = .true. end if - rma_handle%window_ready = .true. end if if (buffer_size > 0) then @@ -1363,34 +1345,20 @@ contains end if call y%device_wait() - ! Pre-post notification receives before opening the window (prevents isend/irecv ordering issues). - if (num_neighbors > 0) then - rma_handle%notify_recv_reqs(1:num_neighbors) = MPI_REQUEST_NULL - rma_handle%notify_send_reqs(1:num_neighbors) = MPI_REQUEST_NULL - end if - do neighbor_idx=1, num_neighbors - proc_to_comm = rma_handle%peer_proc(neighbor_idx) - if (proc_to_comm /= my_rank) then - prc_rank = rma_handle%peer_mpi_rank(neighbor_idx) - call mpi_irecv(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, & - & rma_push_notify_tag, icomm, rma_handle%notify_recv_reqs(neighbor_idx), iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if + ! Open exposure (post) + access (start) epochs over the CACHED neighbor + ! group (built once with the layout in the handle) -- no per-call build. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 end if - end do - - call mpi_win_lock_all(0, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 end if - rma_handle%window_open = .true. - ! Push data to each peer; after flush send a P2P notification so target knows data arrived. + ! Issue every PUT (remote) or perform the self-copy (local). do neighbor_idx=1, num_neighbors proc_to_comm = rma_handle%peer_proc(neighbor_idx) recv_count = rma_handle%peer_recv_counts(neighbor_idx) @@ -1422,19 +1390,6 @@ contains goto 9999 end if end if - call mpi_win_flush(prc_rank, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - call mpi_isend(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, & - & rma_push_notify_tag, icomm, rma_handle%notify_send_reqs(neighbor_idx), iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if else if (send_count /= recv_count) then info = psb_err_internal_error_ @@ -1444,36 +1399,24 @@ contains y%combuf(recv_pos:recv_pos+recv_count-1) = y%combuf(send_pos:send_pos+send_count-1) end if end do - end if - end if - - ! WAIT phase: close epoch, wait for P2P notifications, then scatter. - if (do_wait) then - if (rma_handle%window_open) then - call mpi_win_unlock_all(rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - rma_handle%window_open = .false. - end if - if (num_neighbors > 0) then - call mpi_waitall(num_neighbors, rma_handle%notify_recv_reqs, MPI_STATUSES_IGNORE, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - call mpi_waitall(num_neighbors, rma_handle%notify_send_reqs, MPI_STATUSES_IGNORE, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 + ! Close access epoch (my PUTs done) then exposure epoch (peers' PUTs into + ! me done): receive area now holds the pushed data. Cached group, no free. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_complete(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_wait(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if end if end if + end if + ! WAIT phase: data already delivered by PSCW in START; just scatter into Y. + if (do_wait) then if (total_recv > 0) then call y%sct(int(total_recv,psb_mpk_), rma_handle%peer_recv_indexes, y%combuf(total_send+1:total_send+total_recv), beta) end if @@ -2341,6 +2284,8 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size, total_send_, total_recv_ integer(psb_ipk_), allocatable :: peer_mpi_rank(:) + integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx + integer(psb_mpk_), allocatable :: grp_ranks(:) logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed type(psb_comm_rma_handle), pointer :: rma_handle character(len=30) :: name @@ -2406,33 +2351,42 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent end if end if + ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors + ! combuf is the pinned/registered buffer, so RMA uses pinned memory. if (buffer_size > 0) then if (.not. allocated(y%combuf)) then call y%new_buffer(buffer_size, info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if else if (size(y%combuf) < buffer_size) then + if (rma_handle%window_open) then + call mpi_win_unlock_all(rma_handle%win, iret) + rma_handle%window_open = .false. + end if + if (rma_handle%window_ready) then + call mpi_win_free(rma_handle%win, iret) + rma_handle%win = mpi_win_null + rma_handle%window_ready = .false. + end if call y%new_buffer(buffer_size, info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if end if - end if - if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then - element_bytes = storage_size(y%combuf(1))/8 - exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) - call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & - & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 + if (.not. rma_handle%window_ready) then + element_bytes = storage_size(y%combuf(1))/8 + exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) + call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & + & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info,name,m_err=(/iret/)) + goto 9999 + end if + rma_handle%window_ready = .true. end if - rma_handle%window_ready = .true. end if if (buffer_size > 0) then @@ -2441,7 +2395,19 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent end if call y%device_wait() - ! Pull data from each peer with per-neighbor passive lock (neighbor-only sync). + ! Open exposure (post) + access (start) epochs over the CACHED neighbor + ! group (built once with the layout in the handle) -- no per-call build. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + end if + do neighbor_idx=1, num_neighbors proc_to_comm = rma_handle%peer_proc(neighbor_idx) recv_count = rma_handle%peer_recv_counts(neighbor_idx) @@ -2458,12 +2424,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent goto 9999 end if if (recv_count > 0) then - call mpi_win_lock(MPI_LOCK_SHARED, prc_rank, 0, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if remote_disp = int((remote_base - 1) * n, kind=MPI_ADDRESS_KIND) call mpi_get(y%combuf(recv_pos), recv_count*n, psb_mpi_r_dpk_, prc_rank, remote_disp, recv_count*n, psb_mpi_r_dpk_, & & rma_handle%win, iret) @@ -2472,12 +2432,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent call psb_errpush(info,name,m_err=(/iret/)) goto 9999 end if - call mpi_win_unlock(prc_rank, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if end if else if (send_count /= recv_count) then @@ -2488,6 +2442,19 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent y%combuf(recv_pos:recv_pos+recv_count*n-1) = y%combuf(send_pos:send_pos+send_count*n-1) end if end do + + ! Close access epoch (my GETs done) then exposure epoch (peers done + ! reading my window): received data is now in y%combuf. Cached group, no free. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_complete(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_wait(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + end if end if end if @@ -2531,7 +2498,8 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size, total_send_, total_recv_ integer(psb_ipk_), allocatable :: peer_mpi_rank(:) - integer(psb_mpk_), parameter :: rma_push_notify_tag = 914_psb_mpk_ + integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx + integer(psb_mpk_), allocatable :: grp_ranks(:) logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed type(psb_comm_rma_handle), pointer :: rma_handle character(len=30) :: name @@ -2597,33 +2565,42 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent end if end if + ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors + ! combuf is the pinned/registered buffer, so RMA uses pinned memory. if (buffer_size > 0) then if (.not. allocated(y%combuf)) then call y%new_buffer(buffer_size, info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if else if (size(y%combuf) < buffer_size) then + if (rma_handle%window_open) then + call mpi_win_unlock_all(rma_handle%win, iret) + rma_handle%window_open = .false. + end if + if (rma_handle%window_ready) then + call mpi_win_free(rma_handle%win, iret) + rma_handle%win = mpi_win_null + rma_handle%window_ready = .false. + end if call y%new_buffer(buffer_size, info) if (info /= psb_success_) then - call psb_errpush(psb_err_alloc_dealloc_,name) - goto 9999 + call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999 end if end if - end if - if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then - element_bytes = storage_size(y%combuf(1))/8 - exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) - call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & - & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 + if (.not. rma_handle%window_ready) then + element_bytes = storage_size(y%combuf(1))/8 + exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND) + call mpi_win_create(y%combuf, exposed_bytes, element_bytes, & + & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info,name,m_err=(/iret/)) + goto 9999 + end if + rma_handle%window_ready = .true. end if - rma_handle%window_ready = .true. end if if (buffer_size > 0) then @@ -2632,33 +2609,20 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent end if call y%device_wait() - ! Pre-post notification receives before opening the window. - if (num_neighbors > 0) then - rma_handle%notify_recv_reqs(1:num_neighbors) = MPI_REQUEST_NULL - rma_handle%notify_send_reqs(1:num_neighbors) = MPI_REQUEST_NULL - end if - do neighbor_idx=1, num_neighbors - proc_to_comm = rma_handle%peer_proc(neighbor_idx) - if (proc_to_comm /= my_rank) then - prc_rank = rma_handle%peer_mpi_rank(neighbor_idx) - call mpi_irecv(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, & - & rma_push_notify_tag, icomm, rma_handle%notify_recv_reqs(neighbor_idx), iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if + ! Open exposure (post) + access (start) epochs over the CACHED neighbor + ! group (built once with the layout in the handle) -- no per-call build. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 end if - end do - - call mpi_win_lock_all(0, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 end if - rma_handle%window_open = .true. + ! Issue every PUT (remote) or perform the self-copy (local). do neighbor_idx=1, num_neighbors proc_to_comm = rma_handle%peer_proc(neighbor_idx) recv_count = rma_handle%peer_recv_counts(neighbor_idx) @@ -2684,19 +2648,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent goto 9999 end if end if - call mpi_win_flush(prc_rank, rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - call mpi_isend(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, & - & rma_push_notify_tag, icomm, rma_handle%notify_send_reqs(neighbor_idx), iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if else if (send_count /= recv_count) then info = psb_err_internal_error_ @@ -2706,34 +2657,24 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent y%combuf(recv_pos:recv_pos+recv_count*n-1) = y%combuf(send_pos:send_pos+send_count*n-1) end if end do + + ! Close access epoch (my PUTs done) then exposure epoch (peers' PUTs into + ! me done): receive area now holds the pushed data. Cached group, no free. + if (rma_handle%nbr_grp /= mpi_group_null) then + call mpi_win_complete(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + call mpi_win_wait(rma_handle%win, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999 + end if + end if end if end if - ! WAIT phase: close epoch, wait for P2P notifications, then scatter. + ! WAIT phase: data already delivered by PSCW in START; just scatter into Y. if (do_wait) then - if (rma_handle%window_open) then - call mpi_win_unlock_all(rma_handle%win, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - rma_handle%window_open = .false. - end if - if (num_neighbors > 0) then - call mpi_waitall(num_neighbors, rma_handle%notify_recv_reqs, MPI_STATUSES_IGNORE, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - call mpi_waitall(num_neighbors, rma_handle%notify_send_reqs, MPI_STATUSES_IGNORE, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info,name,m_err=(/iret/)) - goto 9999 - end if - end if if (total_recv > 0) then call y%sct(int(total_recv,psb_mpk_), rma_handle%peer_recv_indexes, y%combuf(total_send_+1:total_send_+total_recv_), beta) end if diff --git a/base/modules/comm/comm_schemes/psb_comm_rma_mod.F90 b/base/modules/comm/comm_schemes/psb_comm_rma_mod.F90 index 6d1a33f1d..5d2226833 100644 --- a/base/modules/comm/comm_schemes/psb_comm_rma_mod.F90 +++ b/base/modules/comm/comm_schemes/psb_comm_rma_mod.F90 @@ -3,6 +3,7 @@ module psb_comm_rma_mod use psb_desc_const_mod, only: psb_proc_id_, psb_n_elem_recv_, psb_elem_recv_, & & psb_n_elem_send_, psb_elem_send_ use psb_error_mod + use, intrinsic :: iso_c_binding, only: c_ptr, c_null_ptr #ifdef PSB_MPI_MOD use mpi #endif @@ -18,6 +19,9 @@ module psb_comm_rma_mod integer(psb_mpk_) :: win = mpi_win_null logical :: window_ready = .false. logical :: window_open = .false. + type(c_ptr) :: win_base = c_null_ptr + integer(psb_ipk_) :: win_nelem = 0 + integer(psb_mpk_) :: nbr_grp = mpi_group_null ! neighbor MPI group, built once with the layout logical :: layout_ready = .false. integer(psb_ipk_) :: layout_nnbr = -1 integer(psb_ipk_) :: layout_send = -1 @@ -57,6 +61,8 @@ contains this%win = mpi_win_null this%window_ready = .false. this%window_open = .false. + this%win_base = c_null_ptr + this%win_nelem = 0 this%layout_ready = .false. this%layout_nnbr = -1 this%layout_send = -1 @@ -67,8 +73,14 @@ contains subroutine psb_comm_rma_clear_memory_buffer_layout(this, info) class(psb_comm_rma_handle), intent(inout) :: this integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_) :: iret info = psb_success_ + ! Neighbors change when the layout is rebuilt: free the cached group. + if (this%nbr_grp /= mpi_group_null) then + call mpi_group_free(this%nbr_grp, iret) + this%nbr_grp = mpi_group_null + end if if (allocated(this%peer_proc)) deallocate(this%peer_proc) if (allocated(this%peer_send_counts)) deallocate(this%peer_send_counts) if (allocated(this%peer_recv_counts)) deallocate(this%peer_recv_counts) @@ -105,6 +117,8 @@ contains integer(psb_ipk_) :: list_pos, send_offset, recv_offset integer(psb_ipk_) :: proc_to_comm, recv_count, send_count integer(psb_ipk_) :: local_meta(4), remote_meta(4) + integer(psb_mpk_) :: comm_grp, n_off, grp_idx + integer(psb_mpk_), allocatable :: grp_ranks(:) call this%clear_memory_buffer_layout(info) if (info /= psb_success_) return @@ -206,6 +220,39 @@ contains return end if + ! Build the neighbor MPI group ONCE (off-diagonal peers); reused by every + ! RMA exchange (PSCW post/start) instead of rebuilding it each call. + n_off = 0 + do neighbor_idx = 1, n_neighbors + if (this%peer_proc(neighbor_idx) /= my_rank) n_off = n_off + 1 + end do + if (n_off > 0) then + allocate(grp_ranks(n_off), stat=iret) + if (iret /= 0) then + info = psb_err_alloc_dealloc_ + return + end if + grp_idx = 0 + do neighbor_idx = 1, n_neighbors + if (this%peer_proc(neighbor_idx) /= my_rank) then + grp_idx = grp_idx + 1 + grp_ranks(grp_idx) = this%peer_mpi_rank(neighbor_idx) + end if + end do + call mpi_comm_group(icomm, comm_grp, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + return + end if + call mpi_group_incl(comm_grp, n_off, grp_ranks, this%nbr_grp, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + return + end if + call mpi_group_free(comm_grp, iret) + deallocate(grp_ranks, stat=iret) + end if + this%layout_nnbr = n_neighbors this%layout_send = send_total this%layout_recv = recv_total @@ -229,6 +276,8 @@ contains call mpi_win_free(this%win, iret) this%win = mpi_win_null end if + this%win_base = c_null_ptr + this%win_nelem = 0 this%window_ready = .false. this%layout_ready = .false. this%layout_nnbr = -1 diff --git a/base/modules/tools/psb_d_nest_builder_mod.F90 b/base/modules/tools/psb_d_nest_builder_mod.F90 index fcbc5ed5a..936fdc336 100644 --- a/base/modules/tools/psb_d_nest_builder_mod.F90 +++ b/base/modules/tools/psb_d_nest_builder_mod.F90 @@ -83,6 +83,15 @@ module psb_d_nest_builder_mod real(psb_dpk_), allocatable :: entry_vals(:) end type psb_d_nest_block_buffer + ! per-block selective halo support: scatter map from a block's own (restricted) + ! column descriptor onto the field-j column space the block matrix is localized + ! against. map(k) is the field-j local column position of the k-th local column + ! of block_col_desc(i,j); used to scatter a per-block psb_halo result into the + ! field vector consumed by the (field-localized) block csmv. + type :: psb_d_nest_blk2field_map + integer(psb_ipk_), allocatable :: map(:) + end type psb_d_nest_blk2field_map + type :: psb_d_nest_matrix type(psb_ctxt_type) :: context integer(psb_ipk_) :: n_fields = 0 @@ -95,6 +104,14 @@ module psb_d_nest_builder_mod type(psb_desc_nest_type) :: grid_desc type(psb_dspmat_type) :: a_glob ! the matrix to hand to Krylov type(psb_desc_type) :: desc_glob ! the global descriptor + ! per-block (selective) halo-exchange support. block_col_desc(i,j) carries + ! ONLY block (i,j)'s own off-process columns (a subset of field j's union + ! halo), so its halo can be exchanged independently of the rest of column j; + ! blk2field(i,j) maps its local columns onto the field-j local columns the + ! block matrix is localized against. These coexist with a_glob/desc_glob + ! (the fused union path) WITHOUT a second copy of the blocks. + type(psb_desc_type), allocatable :: block_col_desc(:,:) + type(psb_d_nest_blk2field_map), allocatable :: blk2field(:,:) contains procedure, pass(op) :: init => psb_d_nest_op_init procedure, pass(op) :: ins => psb_d_nest_op_ins @@ -107,7 +124,7 @@ module psb_d_nest_builder_mod end type psb_d_nest_matrix private - public :: psb_d_nest_matrix + public :: psb_d_nest_matrix, psb_d_nest_blk2field_map contains @@ -201,6 +218,8 @@ contains type(psb_d_nest_base_mat) :: nest_operator integer(psb_ipk_) :: n_fields, i_field, j_field + integer(psb_ipk_) :: field_local_rows, n_block_cols, k_col, field_col + integer(psb_lpk_) :: global_col_idx character(len=24) :: name info = psb_success_ @@ -215,6 +234,50 @@ contains end if end do + ! 1b) per-block (restricted-halo) column descriptors + scatter maps, for the + ! selective halo-exchange regime. Each present block (i,j) gets a + ! descriptor whose halo is ONLY block (i,j)'s own off-process columns + ! (registered from that single block's buffer, NOT the column-wide union), + ! sharing field j's owned distribution. blk2field(i,j)%map sends its + ! local columns onto field j's local columns (the space the block matrix + ! is localized against), so a per-block psb_halo can be scattered into the + ! field vector consumed by the block csmv. The buffers are still alive + ! here (freed in step 6). + allocate(op%block_col_desc(n_fields,n_fields), op%blk2field(n_fields,n_fields), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + do j_field = 1, n_fields + field_local_rows = op%field_desc(j_field)%get_local_rows() + do i_field = 1, n_fields + if (op%block_buffer(i_field,j_field)%n_entries <= 0) cycle + ! owned distribution identical to field j (same nl => same global ranges) + call psb_cdall(op%context, op%block_col_desc(i_field,j_field), info, nl=field_local_rows) + if (info == psb_success_) & + & call psb_cdins(op%block_buffer(i_field,j_field)%n_entries, & + & op%block_buffer(i_field,j_field)%entry_cols(1:op%block_buffer(i_field,j_field)%n_entries), & + & op%block_col_desc(i_field,j_field), info) + if (info == psb_success_) call psb_cdasb(op%block_col_desc(i_field,j_field), info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='per-block cdasb'); return + end if + ! block-local column k -> field-j local column position + n_block_cols = op%block_col_desc(i_field,j_field)%get_local_cols() + allocate(op%blk2field(i_field,j_field)%map(n_block_cols), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + do k_col = 1, n_block_cols + call op%block_col_desc(i_field,j_field)%l2g(k_col, global_col_idx, info) + if (info == psb_success_) call op%field_desc(j_field)%g2l(global_col_idx, field_col, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='blk2field l2g/g2l'); return + end if + op%blk2field(i_field,j_field)%map(k_col) = field_col + end do + end do + end do + ! 2) build the local blocks (generally rectangular) from the triplets op%block_storage%nrblocks = n_fields op%block_storage%ncblocks = n_fields @@ -300,6 +363,17 @@ contains call op%desc_glob%free(local_info) call op%grid_desc%free(local_info) end if + if (allocated(op%block_col_desc)) then + do j_field = 1, size(op%block_col_desc,2) + do i_field = 1, size(op%block_col_desc,1) + ! only present blocks were assembled (and carry a blk2field map) + if (allocated(op%blk2field) .and. allocated(op%blk2field(i_field,j_field)%map)) & + & call op%block_col_desc(i_field,j_field)%free(local_info) + end do + end do + deallocate(op%block_col_desc, stat=local_info) + end if + if (allocated(op%blk2field)) deallocate(op%blk2field, stat=local_info) if (allocated(op%field_desc)) then do i_field = 1, size(op%field_desc) call op%field_desc(i_field)%free(local_info) diff --git a/cuda/Makefile b/cuda/Makefile index 61ba47271..fe8583044 100644 --- a/cuda/Makefile +++ b/cuda/Makefile @@ -106,7 +106,7 @@ svectordev.o: svectordev.h vectordev.h dvectordev.o: dvectordev.h vectordev.h cvectordev.o: cvectordev.h vectordev.h zvectordev.o: zvectordev.h vectordev.h -psb_cuda_env_mod.o: base_cusparse_mod.o +psb_cuda_env_mod.o: base_cusparse_mod.o core_mod.o psb_cuda_mod.o: psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\ psb_d_cuda_vect_mod.o psb_s_cuda_vect_mod.o\ psb_z_cuda_vect_mod.o psb_c_cuda_vect_mod.o\ diff --git a/cuda/psb_d_cuda_vect_mod.F90 b/cuda/psb_d_cuda_vect_mod.F90 index 0dacc511a..c50603fe5 100644 --- a/cuda/psb_d_cuda_vect_mod.F90 +++ b/cuda/psb_d_cuda_vect_mod.F90 @@ -77,6 +77,7 @@ module psb_d_cuda_vect_mod procedure, pass(x) :: set_sync => d_cuda_set_sync procedure, pass(x) :: set_scal => d_cuda_set_scal !!$ procedure, pass(x) :: set_vect => d_cuda_set_vect + procedure, pass(x) :: gthzv => d_cuda_gthzv procedure, pass(x) :: gthzv_x => d_cuda_gthzv_x procedure, pass(y) :: sctb => d_cuda_sctb procedure, pass(y) :: sctb_x => d_cuda_sctb_x @@ -231,6 +232,45 @@ contains end select end subroutine d_cuda_check_addr + subroutine d_cuda_gthzv(n,idx,x,y) + ! GPU override of the plain-array-index gather y(1:n) = x(idx(1:n)). + ! Without it the generic gth(n,idx(:),buf) -- used ONLY by the RMA swap path + ! (psi_d_swapdata, rma_pull/push) -- fell back to the host d_base_gthzv, which + ! calls x%sync() and copies the WHOLE vector device->host on EVERY swap. Here + ! we gather on the device straight from x%deviceVect (mirrors the class-default + ! branch of d_cuda_gthzv_x): only the n boundary indices move H2D, no full D2H. + use psb_cuda_env_mod + use psi_serial_mod + implicit none + integer(psb_mpk_) :: n + integer(psb_ipk_) :: idx(:) + real(psb_dpk_) :: y(:) + class(psb_d_vect_cuda) :: x + integer :: info, ni + + info = 0 + if (x%is_host()) call x%sync() ! ensure device copy is current (no D2H of the whole vector) + ni = size(idx) + + if (x%i_buf_sz < ni) then + if (c_associated(x%i_buf)) then + call freeInt(x%i_buf); x%i_buf = c_null_ptr + end if + info = allocateInt(x%i_buf,ni); x%i_buf_sz = ni + end if + if (x%dt_buf_sz < n) then + if (c_associated(x%dt_buf)) then + call freeDouble(x%dt_buf); x%dt_buf = c_null_ptr + end if + info = allocateDouble(x%dt_buf,n); x%dt_buf_sz = n + end if + + if (info == 0) info = writeInt(x%i_buf,idx,ni) + if (info == 0) info = igathMultiVecDeviceDouble(x%deviceVect, 0, n, 1, x%i_buf, 1, x%dt_buf, 1) + if (info == 0) info = readDouble(x%dt_buf,y,n) + + end subroutine d_cuda_gthzv + subroutine d_cuda_gthzv_x(i,n,idx,x,y) use psb_cuda_env_mod use psi_serial_mod diff --git a/test/comm/cg/gpu_cg.sh b/test/comm/cg/gpu_cg.sh new file mode 100644 index 000000000..d91dd62c4 --- /dev/null +++ b/test/comm/cg/gpu_cg.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# ============================================================================ +# gpu_cg.sh GPU Conjugate Gradient across comm backends -- "sparse ranks" +# +# Final test of the comm-scheme study: a full CG solve (not just spmv) on GPU, +# with few MPI ranks (1 rank = 1 GPU) spread over MANY nodes so the halo +# exchange is dominated by inter-node traffic -- the regime where RMA may pay +# off. The test sweeps ALL backends (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET, +# MPI_PUT) x 3 preconditioners internally, so there is no per-mode loop here. +# +# It also self-verifies, for every scheme, that an internally-allocated work +# vector (like CG's hidden r/p/q/z) inherits the scheme from desc_a%comm_type +# -- look for "[OK] internal-style work vector inherited scheme" in run.out, +# and abort on "INTERNAL-VECTOR SCHEME MISMATCH". +# ============================================================================ +#SBATCH --job-name=cg_gpu +#SBATCH --output=cg_gpu_%j.out +#SBATCH --error=cg_gpu_%j.err +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=4 # MN5-ACC: 4x H100 per node +#SBATCH --gres=gpu:4 +#SBATCH --cpus-per-task=20 # 80 cores / 4 ranks +#SBATCH --time=00:30:00 +#SBATCH --qos=acc_debug +#SBATCH --account=ehpc580 + +# ============================================================================ +# USER CONFIGURATION +# ============================================================================ +NREP=8 # CG solves per (scheme,prec) for statistics +NWARM=1 # warm-up solves (discarded) +ITMAX=500 # CG iteration cap +IDIM=200 # ignored when --matrix is given + +# --- "sparse" knob: GPUs (= ranks) per node. +# RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm. +RANKS_PER_NODE=1 +RANK_POINTS="2 4 8" # total ranks per scale point + +EXE=$HOME/Desktop/scorep/psblas3/test/comm/cg/runs/psb_comm_cg_test +MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx + +# ============================================================================ +# ENVIRONMENT +# ============================================================================ +module purge +module load bsc/1.0 +module load nvidia-hpc-sdk/25.3 +module load gcc/12.3.0 +module load ucx/1.16.0-gcc +module load openmpi/5.0.5-gcc +module load openblas/0.3.27-gcc +export PATH="$HOME/scorep-cuda/bin:$PATH" +export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH" + +export OMPI_MCA_coll_hcoll_enable=0 +export SCOREP_CUDA_ENABLE=yes +export SCOREP_CUDA_BUFFER=48M + +RESDIR=$SLURM_SUBMIT_DIR/results_cg_gpu_${SLURM_JOB_ID} +mkdir -p $RESDIR + +echo "=== CG GPU (sparse-ranks study) ===" +echo " nrep=$NREP nwarm=$NWARM itmax=$ITMAX ranks_per_node=$RANKS_PER_NODE" +echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS]" +echo "===================================" + +for NRANKS in $RANK_POINTS; do + NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE )) + STEP_DIR=$RESDIR/${NRANKS}ranks + mkdir -p $STEP_DIR + + echo "" + echo ">>> CG GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)" + + srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \ + --gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \ + $EXE $IDIM $NREP $NWARM $ITMAX --gpu=TRUE --matrix=$MATRIX --fmt=MM \ + > $STEP_DIR/run.out 2>&1 + + echo ">>> exit=$? output=$STEP_DIR/run.out" + # quick propagation sanity at submit time: + grep -q "INTERNAL-VECTOR SCHEME MISMATCH" $STEP_DIR/run.out \ + && echo " !! SCHEME PROPAGATION FAILED at $NRANKS ranks -- check run.out" +done + +echo "" +echo "=== CG GPU DONE. Results: $RESDIR ===" +ls -R $RESDIR diff --git a/test/comm/cg/psb_comm_cg_test.F90 b/test/comm/cg/psb_comm_cg_test.F90 index 345728390..bbaecaee4 100644 --- a/test/comm/cg/psb_comm_cg_test.F90 +++ b/test/comm/cg/psb_comm_cg_test.F90 @@ -18,6 +18,8 @@ program psb_comm_cg_test type(psb_dspmat_type) :: a type(psb_desc_type) :: desc_a type(psb_d_vect_type) :: b, x + type(psb_d_vect_type) :: probe ! mimics a hidden CG work vector for scheme-propagation check + integer(psb_ipk_) :: probe_got type(psb_dprec_type) :: prec #ifdef PSB_HAVE_CUDA type(psb_d_vect_cuda) :: vmold @@ -30,7 +32,7 @@ program psb_comm_cg_test integer(psb_ipk_) :: desc_me, desc_np integer(psb_ipk_) :: idim, itmax, itrace, istop, iter integer(psb_ipk_) :: scheme_idx, prec_idx, rep, nrep, nwarm - integer(psb_ipk_), parameter :: n_schemes=5, n_precs=2 + integer(psb_ipk_), parameter :: n_schemes=5, n_precs=3 integer(psb_ipk_), allocatable :: iter_count(:,:,:), solve_info(:,:,:) integer(psb_ipk_) :: scheme_type(n_schemes) real(psb_dpk_) :: eps, err, t_start, t_elapsed @@ -81,8 +83,10 @@ program psb_comm_cg_test prec_type(1) = 'NONE' prec_type(2) = 'DIAG' + prec_type(3) = 'BJAC' prec_name(1) = 'none' prec_name(2) = 'diag' + prec_name(3) = 'bjac' call get_command_argument(1,arg) if (len_trim(arg) > 0) then @@ -167,7 +171,7 @@ program psb_comm_cg_test end if write(psb_out_unit,'("Number of processors : ",i0)') np write(psb_out_unit,'("Iterative method : CG")') - write(psb_out_unit,'("Preconditioners : NONE, DIAG")') + write(psb_out_unit,'("Preconditioners : NONE, DIAG, BJAC")') write(psb_out_unit,'("Max iterations (CG) : ",i0)') itmax write(psb_out_unit,'("Repetitions : ",i0)') nrep write(psb_out_unit,'("Warmup solves : ",i0)') nwarm @@ -280,6 +284,36 @@ program psb_comm_cg_test end if end if + ! ---- DECISIVE CHECK: scheme propagation to a HIDDEN-style work vector ---- + ! The x check above can be silently skipped: inside CG, x is only updated + ! with axpby (never haloed), so x%v%comm_handle may stay unallocated. The + ! vectors that ARE haloed (r,p,q,z) are allocated and freed *inside* psb_dcg, + ! so the test cannot inspect them directly. Here we replicate exactly what + ! psb_dcg does to those work vectors -- psb_geall against desc_a + geasb with + ! mold=x%v (so it is GPU-resident too when use_gpu) -- then force one halo. + ! Its comm_handle must lazy-init from desc_a%comm_type, proving that any + ! internally-allocated vector inherits the chosen scheme. + call psb_geall(probe, desc_a, info) + if (info == psb_success_) call psb_geasb(probe, desc_a, info, mold=x%v) + if (info == psb_success_) call psb_geaxpby(done, b, dzero, probe, desc_a, info) + if (info == psb_success_) call psb_halo(probe, desc_a, info) ! triggers dispatch + if (info /= psb_success_) goto 9999 + probe_got = -1 + if (allocated(probe%v%comm_handle)) probe_got = probe%v%comm_handle%comm_type + if (probe_got /= scheme_type(scheme_idx)) then + if (my_rank == psb_root_) & + write(psb_err_unit,'("INTERNAL-VECTOR SCHEME MISMATCH rank=",i0," expected=",i0," got=",i0)') & + my_rank, scheme_type(scheme_idx), probe_got + info = psb_err_internal_error_ + goto 9999 + else if (my_rank == psb_root_ .and. rep == 1) then + write(psb_out_unit,'(" [OK] internal-style work vector inherited scheme: ",a)') & + trim(scheme_name(scheme_idx)) + end if + call psb_gefree(probe, desc_a, info) + if (info /= psb_success_) goto 9999 + ! ------------------------------------------------------------------------ + call psb_geaxpby(dzero,b,dzero,x,desc_a,info) if (info /= psb_success_) goto 9999 diff --git a/test/comm/spmv/Makefile b/test/comm/spmv/Makefile index dd1df48b8..080cbfcf4 100644 --- a/test/comm/spmv/Makefile +++ b/test/comm/spmv/Makefile @@ -15,7 +15,7 @@ TOBJS=psb_spmv_test.o EXEDIR=./runs -all: runsd spmv_overlap +all: runsd psb_spmv_kernel runsd: (if test ! -d runs ; then mkdir runs; fi) @@ -23,10 +23,9 @@ runsd: psb_spmv_test.o: psb_spmv_test.f90 $(FC) $(FCOPT) -cpp $(FINCLUDES) $(FDEFINES) $(FCUDEFINES) -c $< -o $@ -spmv_overlap: $(TOBJS) - $(FLINK) $(LOPT) $(TOBJS) -o spmv_overlap $(PSBLAS_LIB) $(LDLIBS) - /bin/mv spmv_overlap $(EXEDIR) - /bin/cp -f $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel +psb_spmv_kernel: $(TOBJS) + $(FLINK) $(LOPT) $(TOBJS) -o psb_spmv_kernel $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_spmv_kernel $(EXEDIR) clean: /bin/rm -f $(TOBJS) $(TOBJS_API) *$(.mod) $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel diff --git a/test/comm/spmv/extract_scorep.sh b/test/comm/spmv/extract_scorep.sh new file mode 100755 index 000000000..2607203d8 --- /dev/null +++ b/test/comm/spmv/extract_scorep.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# ============================================================================ +# extract_scorep.sh -- auto-extract comm-scheme measurements from a +# results_*_ directory (swapdata/spmv style). +# +# For every ranks/ subdir it collects: +# 1) run.out wall-clock avg time per backend (the REAL headline number) +# 2) cube_stat -r per swap routine from scorep_profile/profile.cubex: +# - INCL (total time in the routine, summed over call paths) +# - one-time setup (Win_create/Win_allocate/topology_init/ +# ini_memory_buffer_layout/alltoallv_init) +# - steady = INCL - one-time +# - every MPI_* component (to scorep_components.csv) +# +# Output (written into RESULTS_DIR): +# runout.csv ranks,backend,total_s,avg_s +# scorep_summary.csv ranks,scheme,incl_s,onetime_s,steady_s +# scorep_components.csv ranks,scheme,component,time_s +# plus pretty pivot tables on stdout. +# +# Usage: +# ./extract_scorep.sh # process ALL ranks/ in current dir +# ./extract_scorep.sh 80ranks # process ONE rank-point dir +# ./extract_scorep.sh path/to/results # process ALL ranks/ under that dir +# +# CSVs ACCUMULATE (header written only if absent), so you can run one +# rank-point at a time and the pivots grow. To start over: rm *.csv +# ============================================================================ +set -u +ARG="${1:-.}" +OUTDIR="${OUTDIR:-.}" # where the CSVs go (default: current dir) + +command -v cube_stat >/dev/null 2>&1 || { echo "ERROR: cube_stat not in PATH"; exit 1; } + +# scheme label : mangled routine name used by cube_stat -r +SCHEMES=( + "baseline:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_baseline_vect" + "neighbor:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_topology_vect" + "persistent:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_persistent_topology_vect" + "rma_pull:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_pull_vect" + "rma_push:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_push_vect" +) + +RUNOUT="$OUTDIR/runout.csv" +SUMMARY="$OUTDIR/scorep_summary.csv" +COMPONENTS="$OUTDIR/scorep_components.csv" +# write header only if the file does not exist yet (append/accumulate mode) +[ -f "$RUNOUT" ] || echo "ranks,backend,total_s,avg_s,setup_s" > "$RUNOUT" +[ -f "$SUMMARY" ] || echo "ranks,scheme,incl_s,setup_s,gpuwait_s,pack_s,mpimove_s,mpiwait_s,netmpi_s" > "$SUMMARY" +[ -f "$COMPONENTS" ] || echo "ranks,scheme,bucket,component,time_s" > "$COMPONENTS" + +# Select dirs: a single rank-point dir (has scorep_profile/run.out) or, if ARG +# is a results dir, all its ranks/ subdirs. +if [ -f "$ARG/scorep_profile/profile.cubex" ] || [ -f "$ARG/run.out" ]; then + DIRS=( "$ARG" ) +else + DIRS=( $(ls -d "$ARG"/*ranks 2>/dev/null | sort -t/ -k99 -V) ) +fi +[ ${#DIRS[@]} -eq 0 ] && { echo "No rank-point dir(s) found under '$ARG'"; exit 1; } + +for d in "${DIRS[@]}"; do + ranks=$(basename "$d" | sed 's/[^0-9]//g') + [ -z "$ranks" ] && continue + + # ---- 1) run.out wall-clock avg + setup per backend ---- + # One run.out holds several "comm backend" blocks in sequence; we accumulate + # tot/avg/setup for the current block and flush it when the next block starts + # (or at EOF). "setup time" is optional: 0 if the benchmark does not emit it + # (i.e. until the warm-up-timing patch lands), so the break-even falls back to + # pure steady-state comparison. + if [ -f "$d/run.out" ]; then + awk -v r="$ranks" ' + function flush(){ if (be!=""){ print r","be","tot","avg","setup } be="";tot=0;avg=0;setup=0 } + /comm backend/ { flush(); split($0,a,":"); be=a[2]; gsub(/ /,"",be) } + /total time/ { split($0,a,":"); tot=a[2]+0 } + /avg time/ { split($0,a,":"); avg=a[2]+0 } + /setup time/ { split($0,a,":"); setup=a[2]+0 } + END { flush() } + ' "$d/run.out" >> "$RUNOUT" + fi + + # ---- 2) scorep cube_stat per scheme ---- + cubex="$d/scorep_profile/profile.cubex" + [ -f "$cubex" ] || { echo "WARN: no profile.cubex in $d" >&2; continue; } + + for entry in "${SCHEMES[@]}"; do + label="${entry%%:*}"; routine="${entry#*:}" + out=$(cube_stat -r "$routine" "$cubex" 2>/dev/null) + [ -z "$out" ] && continue + printf '%s\n' "$out" | awk -v ranks="$ranks" -v scheme="$label" -v comp="$COMPONENTS" ' + # Bucketize every child component of the swap routine. cube_stat -r reports, + # for each child, its inclusive time within the routine subtree, and the + # children partition INCL -- so per-bucket sums are meaningful. + # + # Why this matters on GPU: the single largest component in EVERY scheme is + # d_cuda_device_wait (~0.036s, the GPU stream sync), which is NOT network + # time and is ~equal across schemes. The old "steady = INCL - onetime" left + # it inside, masking the real differentiator -- the MPI wait/sync time. + # Buckets (each component lands in exactly one, order matters): + # setup : one-time Win_create / topology_init / alltoallv_init / + # ini_memory_buffer_layout + # gpuwait : GPU stream sync (device_wait, cuda_sync) -- exclude from MPI + # mpiwait : blocking MPI sync (MPI_Wait, Win_wait/post/start/complete) + # <-- the discriminating metric ("guarda le wait") + # mpimove : actual transfer (Isend/Irecv/Put/Get/Start/[I]neighbor_alltoallv) + # pack : gather/scatter/buffer (gthz*, sctb*, new_buffer, realloc) + # netmpi = mpimove + mpiwait (honest per-iteration network cost) + { + k=split($0,a,","); v=a[k]+0 # value = last comma-field + nm=$0; sub(/,[^,]*$/,"",nm) # name = everything before it + if (nm ~ /^INCL\(/) { incl+=v; next } + if (nm ~ /^EXCL\(/) { next } + bkt="other" + if (nm ~ /Win_create|Win_allocate|topology_init|ini_memory_buffer_layout|alltoallv_init/) { setup+=v; bkt="setup" } + else if (nm ~ /device_wait|cuda_sync/) { gpuwait+=v; bkt="gpuwait" } + else if (nm ~ /MPI_Wait|Win_wait|Win_post|Win_start|Win_complete/) { mpiwait+=v; bkt="mpiwait" } + else if (nm ~ /MPI_Isend|MPI_Irecv|MPI_Send|MPI_Recv|MPI_Put|MPI_Get|MPI_Start|neighbor_alltoallv|Neighbor_alltoallv/) { mpimove+=v; bkt="mpimove" } + else if (nm ~ /gthz|sctb|new_buffer|realloc/) { pack+=v; bkt="pack" } + cn=nm; sub(/.*::/,"",cn); gsub(/"/,"",cn) + print ranks","scheme","bkt","cn","v >> comp + } + END { printf "%s,%s,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f\n", \ + ranks, scheme, incl, setup, gpuwait, pack, mpimove, mpiwait, mpimove+mpiwait } + ' >> "$SUMMARY" + done +done + +# ---- pretty pivots ---- +pivot() { # $1=csv $2=value-col-index(1-based, after ranks,key) $3=title + awk -F, -v vc="$2" -v title="$3" ' + NR==1 { next } + { r=$1; k=$2; val=$(2+vc); v[r","k]=val; rs[r]=1; ks[k]=1 } + END { + n=0; for (k in ks) cols[++n]=k + # stable-ish column order + order="baseline neighbor persistent rma_pull rma_push P2P NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT" + m=split(order,oc," "); delete cols; n=0 + for (i=1;i<=m;i++) if (oc[i] in ks) cols[++n]=oc[i] + printf "\n=== %s ===\n", title + printf "%-8s", "ranks"; for (i=1;i<=n;i++) printf "%14s", cols[i]; printf "\n" + nr=0; for (r in rs) rr[++nr]=r + # numeric sort of ranks + for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t} + for (i=1;i<=nr;i++){ printf "%-8s", rr[i]; for (c=1;c<=n;c++){key=rr[i]","cols[c]; printf "%14s", (key in v)?v[key]:"-"} printf "\n" } + }' "$1" +} + +# ---- break-even pivot: iterations after which scheme X beats P2P ---- +# Model for an iterative solver doing n_it spmv (halo exchanges): +# T_X(n) = setup_X + n * steady_X (steady = run.out avg, setup excluded +# by the warm-up; setup = run.out "setup time") +# X beats the P2P reference when T_X(n) < T_P2P(n). Solving for the crossover: +# n* = (setup_X - setup_P2P) / (steady_P2P - steady_X) +# Output legend (per cell): +# : break-even n* (X wins for n_it > n*) -- more setup, less steady +# always : X wins from iteration 1 (cheaper setup AND cheaper steady) -> dominates P2P +# never : P2P wins for all n_it (more setup AND slower steady) -> X is dominated +# rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t} + anysetup=0 + for (i=1;i<=nr;i++){ + r=rr[i]; printf "%-8s", r + sref=steady[r",P2P"]; pref=setp[r",P2P"] + for (c=1;c<=m;c++){ be=oc[c]; if (!(be in bes)) continue + key=r","be + if (!(key in steady) || !(r",P2P" in steady)) { printf "%14s","-"; continue } + dsteady = sref - steady[key] # >0 : X faster per-iter than P2P + dsetup = setp[key] - pref # >0 : X pays more setup than P2P + if (setp[key]>0 || pref>0) anysetup=1 + if (dsteady > 0) printf "%14s", (dsetup<=0 ? "always" : sprintf("%.0f", dsetup/dsteady)) + else if (dsteady < 0) printf "%14s", (dsetup>=0 ? "never" : sprintf("<%.0f", dsetup/dsteady)) + else printf "%14s", (dsetup<0 ? "always" : "never") + } + printf "\n" + } + if (!anysetup) print "\n NOTE: all setup_s == 0 -> run.out has no \"setup time\" yet (apply the\n warm-up-timing patch to the benchmark). Columns reflect steady-state only." + }' "$RUNOUT" +} + +echo "Wrote: $RUNOUT $SUMMARY $COMPONENTS" +pivot "$RUNOUT" 2 "run.out AVG time per backend [s] (real wall-clock = steady-state per iter)" +pivot "$RUNOUT" 3 "run.out SETUP time per backend [s] (one-time, paid in warm-up)" +breakeven +pivot "$SUMMARY" 6 "scorep MPI WAIT per scheme [s] <== THE discriminator (sync/imbalance; 'guarda le wait')" +pivot "$SUMMARY" 7 "scorep NET MPI per scheme [s] (mpimove + mpiwait; excl GPU sync, pack, one-time setup)" +pivot "$SUMMARY" 5 "scorep MPI MOVE per scheme [s] (pure transfer: Isend/Irecv/Put/Get/a2av)" +pivot "$SUMMARY" 2 "scorep SETUP per scheme [s] (one-time: Win_create/topology_init/alltoallv_init/buf_layout)" +pivot "$SUMMARY" 3 "scorep GPU WAIT per scheme [s] (d_cuda_device_wait; sanity: ~equal across schemes)" +pivot "$SUMMARY" 1 "scorep INCL per scheme [s] (everything; dominated by GPU wait -> misleading alone)" + +echo +echo "Tip: full per-component breakdown (now bucketed) is in $COMPONENTS" +echo " (ranks,scheme,bucket,component,time). e.g.:" +echo " grep ',rma_pull,' $COMPONENTS | sort -t, -k5 -gr # rma_pull hot spots" +echo " awk -F, '\$3==\"mpiwait\"' $COMPONENTS # only the waits" diff --git a/test/comm/spmv/gpu_spmv.sh b/test/comm/spmv/gpu_spmv.sh new file mode 100644 index 000000000..3263a7365 --- /dev/null +++ b/test/comm/spmv/gpu_spmv.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# ============================================================================ +# gpu_spmv.sh GPU SpMV across comm backends -- "sparse ranks" study +# +# Goal: few MPI ranks spread over MANY nodes (1 rank = 1 GPU), so that the +# halo exchange is dominated by INTER-node traffic. This is where RMA +# (MPI_Get/MPI_Put) may finally pay off: with few, fat messages over the +# network the one-time window-creation cost can amortize (break-even n*), +# unlike the dense CPU runs where RMA was dominated. +# +# The spmv test sweeps ALL backends internally (P2P, NEIGHBOR, PNEIGHBOR, +# MPI_GET, MPI_PUT) in one run -> no per-mode loop here. +# ============================================================================ +#SBATCH --job-name=spmv_gpu +#SBATCH --output=spmv_gpu_%j.out +#SBATCH --error=spmv_gpu_%j.err +#SBATCH --nodes=8 # reserve up front; we use a subset per point +#SBATCH --ntasks-per-node=4 # MN5-ACC has 4x H100 per node +#SBATCH --gres=gpu:4 # request all 4 GPUs on each reserved node +#SBATCH --cpus-per-task=20 # 80 cores / 4 ranks = 20 cores per rank +#SBATCH --time=00:30:00 +#SBATCH --qos=acc_debug +#SBATCH --account=ehpc580 + +# ============================================================================ +# USER CONFIGURATION +# ============================================================================ +TIMES=50 # SpMV repetitions (timed) +GPU_FMT=HLG # GPU matrix format (HLG|ELG|CSRG|HDIAG) +PROFILE=true # Score-P profiling on/off + +# --- "sparse" knob: how many ranks (= GPUs) to put on each node. +# RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm +# (this is the configuration that isolates RMA-over-network) +# RANKS_PER_NODE=4 -> pack all 4 GPUs/node (denser, more intra-node) +RANKS_PER_NODE=1 + +# total ranks per scale point. With RANKS_PER_NODE=1 these are also node counts, +# so "2 4 8" means 2, 4, 8 nodes each holding a single GPU. +RANK_POINTS="2 4 8" + +EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel +MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx + +# ============================================================================ +# ENVIRONMENT +# ============================================================================ +module purge +module load bsc/1.0 +module load nvidia-hpc-sdk/25.3 +module load gcc/12.3.0 +module load ucx/1.16.0-gcc +module load openmpi/5.0.5-gcc +module load openblas/0.3.27-gcc +export PATH="$HOME/scorep-cuda/bin:$PATH" +export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH" + +export OMPI_MCA_coll_hcoll_enable=0 +export SCOREP_CUDA_ENABLE=yes # GPU run: turn the CUDA adapter ON +export SCOREP_CUDA_BUFFER=48M # per-stream CUDA record buffer + +RESDIR=$SLURM_SUBMIT_DIR/results_spmv_gpu_${SLURM_JOB_ID} +mkdir -p $RESDIR + +# --- Score-P region filter (same as CPU: drop tiny high-frequency USR funcs) --- +FILTER=$RESDIR/scorep.filt +cat > $FILTER <<'EOF' +SCOREP_REGION_NAMES_BEGIN + EXCLUDE + psb_indx_map_mod::* + psb_desc_mod::* + psb_error_mod::* + psb_gen_block_map_mod::* + psi_penv_mod::* + psb_hash_mod::* +SCOREP_REGION_NAMES_END +EOF + +echo "=== SPMV GPU (sparse-ranks study) ===" +echo " times=$TIMES gpu_fmt=$GPU_FMT ranks_per_node=$RANKS_PER_NODE" +echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE" +echo "=====================================" + +for NRANKS in $RANK_POINTS; do + NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE )) + STEP_DIR=$RESDIR/${NRANKS}ranks + mkdir -p $STEP_DIR + + if [ "$PROFILE" = "true" ]; then + export SCOREP_ENABLE_PROFILING=true + export SCOREP_ENABLE_TRACING=false + export SCOREP_TOTAL_MEMORY=256M # bumped: CUDA adapter needs more + export SCOREP_FILTERING_FILE=$FILTER + export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile + else + export SCOREP_ENABLE_PROFILING=false + export SCOREP_ENABLE_TRACING=false + fi + + echo "" + echo ">>> GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)" + + srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \ + --gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \ + $EXE --gpu=TRUE --gpu_fmt=$GPU_FMT --matrix=$MATRIX --fmt=MM --times=$TIMES \ + > $STEP_DIR/run.out 2>&1 + + echo ">>> exit=$? output=$STEP_DIR/run.out" +done + +echo "" +echo "=== SPMV GPU DONE. Results: $RESDIR ===" +ls -R $RESDIR diff --git a/test/comm/spmv/psb_spmv_test.f90 b/test/comm/spmv/psb_spmv_test.f90 index 5efae0077..a7fff7492 100644 --- a/test/comm/spmv/psb_spmv_test.f90 +++ b/test/comm/spmv/psb_spmv_test.f90 @@ -525,7 +525,7 @@ contains return end subroutine psb_d_gen_pde3d - subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,do_swap,comm_mode) + subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,comm_mode) use psb_base_mod #ifdef PSB_HAVE_CUDA use psb_cuda_mod @@ -539,7 +539,6 @@ contains character(len=*), intent(in) :: cpu_fmt character(len=*), intent(in) :: gpu_fmt integer(psb_ipk_), intent(in) :: idim_in, times_in - logical, intent(in) :: do_swap character(len=*), intent(in) :: comm_mode type(psb_dspmat_type) :: a @@ -628,9 +627,6 @@ contains end if if (info /= psb_success_) goto 9999 - call psb_comm_set(comm_type,x%v%comm_handle,info) - if (info /= psb_success_) goto 9999 - #ifdef PSB_HAVE_CUDA if (use_gpu) then select case(psb_toupper(trim(gpu_fmt))) @@ -654,14 +650,22 @@ contains end if #endif + ! Set the comm scheme on the DESCRIPTOR, not the vector. desc_a%comm_type + ! survives the GPU x%cnv above (which allocates a fresh x%v WITHOUT copying + ! comm_handle); x then lazy-inits its handle from desc_a%comm_type at the + ! first halo. Setting psb_comm_set on x%v%comm_handle *before* cnv silently + ! reverted every backend to baseline on GPU (CPU was unaffected: no cnv). + call desc_a%set_comm_scheme(comm_type, info) + if (info /= psb_success_) goto 9999 + ! warm-up - call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap) + call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.true.) if (info /= psb_success_) goto 9999 call psb_barrier(ctxt) t0 = psb_wtime() do i = 1, times - call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap) + call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.true.) if (info /= psb_success_) exit end do t1 = psb_wtime() @@ -672,11 +676,7 @@ contains avg_t = dt / real(times, psb_dpk_) if (my_rank == psb_root_) then - if (do_swap) then - write(psb_out_unit,'(/,"SpMV benchmark (overlap)")') - else - write(psb_out_unit,'(/,"SpMV benchmark (no overlap)")') - end if + write(psb_out_unit,'(/,"SpMV benchmark")') write(psb_out_unit,'(" cpu matrix fmt : ",a)') trim(afmt) if (use_gpu) write(psb_out_unit,'(" gpu matrix fmt : ",a)') trim(psb_toupper(trim(gpu_fmt))) if (use_external_matrix) then @@ -784,7 +784,6 @@ program psb_spmv_kernel character(len=8) :: cpu_fmt character(len=8) :: gpu_fmt integer(psb_ipk_) :: idim_arg, times_arg - logical :: do_overlap integer :: kmode integer, parameter :: n_comm_modes = 5 character(len=20), parameter :: comm_modes(n_comm_modes) = [character(len=20) :: & @@ -797,7 +796,6 @@ program psb_spmv_kernel matrix_fmt = 'MM' cpu_fmt = 'CSR' gpu_fmt = 'HLG' - do_overlap = .true. call psb_init(ctxt) call psb_info(ctxt, my_rank, np) @@ -842,10 +840,6 @@ program psb_spmv_kernel gpu_fmt = psb_toupper(adjustl(arg(14:len_trim(arg)))) else if (index(psb_toupper(trim(arg)), '--GPU_FMT=') == 1) then gpu_fmt = psb_toupper(adjustl(arg(11:len_trim(arg)))) - else if ((trim(psb_toupper(arg)) == '--NOOVERLAP') .or. (trim(psb_toupper(arg)) == '--NO_OVERLAP')) then - do_overlap = .false. - else if ((trim(psb_toupper(arg)) == '--OVERLAP') .or. (trim(psb_toupper(arg)) == '--SWAP')) then - do_overlap = .true. else if (trim(psb_toupper(arg)) == '--MATRIX') then if (k < command_argument_count()) call get_command_argument(k+1,matrix_file) else if (trim(psb_toupper(arg)) == '--FMT') then @@ -897,7 +891,7 @@ program psb_spmv_kernel write(psb_out_unit,'("GPU enabled : ",l1)') use_gpu write(psb_out_unit,'("Usage: ./psb_spmv_kernel [--gpu=TRUE|FALSE] [--dim=N] [--times=N] ",& &"[--cpu_fmt=CSR|COO|CSC|ELL|HLL] [--gpu_fmt=HLL|ELL|CSR|HDIA] [--matrix=] [--fmt=MM|HB] ",& - &"[--overlap|--nooverlap] (runs all comm backends)")') + &"(runs all comm backends)")') end if do kmode = 1, n_comm_modes @@ -905,7 +899,7 @@ program psb_spmv_kernel write(psb_out_unit,'(/,"=== Backend sweep: ",a," ===")') trim(comm_modes(kmode)) end if call run_spmv_kernel(ctxt, use_gpu, matrix_file, matrix_fmt, cpu_fmt, gpu_fmt, & - & idim_arg, times_arg, do_overlap, comm_modes(kmode)) + & idim_arg, times_arg, comm_modes(kmode)) end do #ifdef PSB_HAVE_CUDA diff --git a/test/comm/spmv/strong_spmv.sh b/test/comm/spmv/strong_spmv.sh new file mode 100644 index 000000000..b385c7074 --- /dev/null +++ b/test/comm/spmv/strong_spmv.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# ============================================================================ +# strong_spmv.sbatch STRONG scaling, CPU-only SpMV across comm backends +# +# Fixed total problem size; MPI ranks grow across scale points. The spmv test +# sweeps ALL comm backends internally (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET, +# MPI_PUT) in a single run, so there is no per-mode loop. +# All scale points run inside ONE reserved allocation (same hardware). +# ============================================================================ +#SBATCH --job-name=spmv_strong +#SBATCH --output=spmv_strong_%j.out +#SBATCH --error=spmv_strong_%j.err +#SBATCH --nodes=8 # reserve the maximum up front +#SBATCH --ntasks-per-node=80 # 80 cores per ACC node -> 80 ranks/node +#SBATCH --cpus-per-task=1 # CPU-only: one core per rank +#SBATCH --time=00:30:00 +#SBATCH --qos=acc_debug +#SBATCH --account=ehpc580 + +# ============================================================================ +# USER CONFIGURATION +# ============================================================================ +DIM=280 # FIXED problem size (idim^3 unknowns) +TIMES=50 # SpMV repetitions (timed) +CPU_FMT=CSR # CPU matrix storage format +PROFILE=true # Score-P profiling on/off +RANK_POINTS="80 160 320 640" # total ranks per scale point (multiples of 80) + +EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel + +# ============================================================================ +# ENVIRONMENT +# ============================================================================ +module purge +module load bsc/1.0 +module load nvidia-hpc-sdk/25.3 +module load gcc/12.3.0 +module load ucx/1.16.0-gcc +module load openmpi/5.0.5-gcc +module load openblas/0.3.27-gcc +export PATH="$HOME/scorep-cuda/bin:$PATH" +export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH" + +export OMPI_MCA_coll_hcoll_enable=0 +export SCOREP_CUDA_ENABLE=no # CPU-only run: silence Score-P CUDA adapter + +RESDIR=$SLURM_SUBMIT_DIR/results_spmv_strong_${SLURM_JOB_ID} +mkdir -p $RESDIR + +# --- Score-P region filter: exclude the high-frequency tiny USR functions so +# the profile reflects real MPI/comm time, not instrumentation overhead. --- +FILTER=$RESDIR/scorep.filt +cat > $FILTER <<'EOF' +SCOREP_REGION_NAMES_BEGIN + EXCLUDE + psb_indx_map_mod::* + psb_desc_mod::* + psb_error_mod::* + psb_gen_block_map_mod::* + psi_penv_mod::* + psb_hash_mod::* +SCOREP_REGION_NAMES_END +EOF + +echo "=== SPMV STRONG scaling (CPU-only) ===" +echo " fixed_dim=$DIM times=$TIMES cpu_fmt=$CPU_FMT" +echo " reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE" +echo "======================================" + +for NRANKS in $RANK_POINTS; do + NNODES=$(( (NRANKS + 79) / 80 )) # 80 ranks per node + STEP_DIR=$RESDIR/${NRANKS}ranks + mkdir -p $STEP_DIR + + if [ "$PROFILE" = "true" ]; then + export SCOREP_ENABLE_PROFILING=true + export SCOREP_ENABLE_TRACING=false + export SCOREP_TOTAL_MEMORY=128M + export SCOREP_FILTERING_FILE=$FILTER + export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile + else + export SCOREP_ENABLE_PROFILING=false + export SCOREP_ENABLE_TRACING=false + fi + + echo "" + echo ">>> STRONG point: $NRANKS ranks ($NNODES nodes), fixed dim=$DIM" + srun -N $NNODES -n $NRANKS --ntasks-per-node=80 --cpus-per-task=1 \ + $EXE --gpu=FALSE --dim=$DIM --times=$TIMES --cpu_fmt=$CPU_FMT \ + > $STEP_DIR/run.out 2>&1 + echo ">>> exit=$? output=$STEP_DIR/run.out" +done + +echo "" +echo "=== SPMV STRONG DONE. Results: $RESDIR ===" +ls -R $RESDIR diff --git a/test/nested/CMakeLists.txt b/test/nested/CMakeLists.txt index 7e5c44e05..dcc097dd2 100644 --- a/test/nested/CMakeLists.txt +++ b/test/nested/CMakeLists.txt @@ -26,6 +26,7 @@ file(MAKE_DIRECTORY ${EXEDIR}) set(SOURCES_D_NEST_GLOB_TEST psb_d_nest_glob_test.F90) set(SOURCES_D_NEST_RECT_TEST psb_d_nest_rect_test.F90) set(SOURCES_D_NEST_CG_TEST psb_d_nest_cg_test.F90) +set(SOURCES_D_NEST_HALO_TEST psb_d_nest_halo_regime_test.F90) add_executable(psb_d_nest_glob_test ${SOURCES_D_NEST_GLOB_TEST}) target_link_libraries(psb_d_nest_glob_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) @@ -36,8 +37,11 @@ target_link_libraries(psb_d_nest_rect_test psblas::util psblas::linsolve psblas: add_executable(psb_d_nest_cg_test ${SOURCES_D_NEST_CG_TEST}) target_link_libraries(psb_d_nest_cg_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) +add_executable(psb_d_nest_halo_regime_test ${SOURCES_D_NEST_HALO_TEST}) +target_link_libraries(psb_d_nest_halo_regime_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) + # Set output directory for executables -foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test) +foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_halo_regime_test) set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${EXEDIR} ) diff --git a/test/nested/Makefile b/test/nested/Makefile index d647aee38..f909e0f68 100644 --- a/test/nested/Makefile +++ b/test/nested/Makefile @@ -16,11 +16,15 @@ FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG). EXEDIR=./runs -all: runsd psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test +all: runsd psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_halo_regime_test runsd: (if test ! -d runs ; then mkdir runs; fi) +psb_d_nest_halo_regime_test: psb_d_nest_halo_regime_test.o + $(FLINK) psb_d_nest_halo_regime_test.o -o psb_d_nest_halo_regime_test $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_d_nest_halo_regime_test $(EXEDIR) + psb_d_nest_glob_test: psb_d_nest_glob_test.o $(FLINK) psb_d_nest_glob_test.o -o psb_d_nest_glob_test $(PSBLAS_LIB) $(LDLIBS) /bin/mv psb_d_nest_glob_test $(EXEDIR) @@ -34,8 +38,9 @@ psb_d_nest_cg_test: psb_d_nest_cg_test.o /bin/mv psb_d_nest_cg_test $(EXEDIR) clean: - /bin/rm -f psb_d_nest_glob_test.o psb_d_nest_rect_test.o psb_d_nest_cg_test.o *$(.mod) \ - $(EXEDIR)/psb_d_nest_glob_test $(EXEDIR)/psb_d_nest_rect_test $(EXEDIR)/psb_d_nest_cg_test + /bin/rm -f psb_d_nest_glob_test.o psb_d_nest_rect_test.o psb_d_nest_cg_test.o psb_d_nest_halo_regime_test.o *$(.mod) \ + $(EXEDIR)/psb_d_nest_glob_test $(EXEDIR)/psb_d_nest_rect_test $(EXEDIR)/psb_d_nest_cg_test \ + $(EXEDIR)/psb_d_nest_halo_regime_test verycleanlib: (cd ../..; make veryclean) lib: diff --git a/test/nested/psb_d_nest_halo_regime_test.F90 b/test/nested/psb_d_nest_halo_regime_test.F90 new file mode 100644 index 000000000..d16835927 --- /dev/null +++ b/test/nested/psb_d_nest_halo_regime_test.F90 @@ -0,0 +1,398 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! File: psb_d_nest_halo_regime_test.F90 +! Author: Simone Staccone (Stack-1) +! +! Benchmarks the two halo-exchange regimes of a nested (MATNEST-style) operator +! on the SAME assembled blocks: +! +! * UNION (fused): one psb_halo over the composed global descriptor brings in +! the union of every field's ghosts at once (the production path used by +! psb_spmm / Krylov / AMG4PSBLAS). +! +! * PER-BLOCK (selective): each present block (i,j) exchanges ONLY its own +! off-process columns, through its restricted descriptor block_col_desc(i,j), +! one psb_halo per block. +! +! The two products must coincide to machine precision. The test reports, for +! each regime: the number of halo exchanges, the full-matvec time, and the +! PURE-COMMUNICATION time (only the psb_halo calls), so the cost of the message +! aggregation can be isolated. +! +! Saddle-point 2x2 operator [ A B^T ; B 0 ] with TWO fields of DIFFERENT +! size (n1 = field 1 / "velocity", n2 = field 2 / "pressure", n1 /= n2 allowed); +! the (2,2) block is absent. The blocks have REAL, distinct per-block halos (so +! per-block vs union is meaningful): A (1,1) tridiagonal on field 1; the +! rectangular B^T (1,2) and B (2,1) couple every row to a column half a field +! away, which lands on another process. +! +! Run: mpirun -np

./psb_d_nest_halo_regime_test [n1] [n2] [n_reps] [stride] +! n1 : global rows of field 1 (default 2000000) +! n2 : global rows of field 2 (default 500000) +! n_reps : timed repetitions (default 50) +! stride : B/B^T coupling distance (default 1) +! small => local halo => LATENCY-bound (aggregation wins) +! ~n/2 => huge halo => BANDWIDTH-bound (union ~ per-block) +! +program psb_d_nest_halo_regime_test + use psb_base_mod + use psb_d_nest_mod + implicit none + + type(psb_ctxt_type) :: context + integer(psb_ipk_) :: my_rank, num_procs, info, i_local_row + integer(psb_ipk_) :: entry_idx, field1_local_rows, field2_local_rows + integer(psb_ipk_) :: n_local_rows, n_local_cols, n_exch, n_warm + integer(psb_ipk_) :: n_reps, narg + integer(psb_lpk_) :: global_row, field1_size, field2_size, gcol, stride + character(len=64) :: arg + + type(psb_d_nest_matrix) :: nested_matrix + integer(psb_lpk_), allocatable :: entry_rows(:), entry_cols(:) + integer(psb_lpk_), allocatable :: field1_rows(:), field2_rows(:) + real(psb_dpk_), allocatable :: entry_vals(:) + real(psb_dpk_), allocatable :: x_arr(:), work_x(:), y_union(:), y_perblock(:) + real(psb_dpk_) :: mismatch_norm, t_u, t_b + logical :: ok_u, ok_b + integer(psb_ipk_) :: s_idx, n_schemes + integer(psb_ipk_), allocatable :: schemes(:) + character(len=34), allocatable :: scheme_names(:) + type(psb_d_vect_type) :: xg_vect ! global-local halo vector + type(psb_d_vect_type), allocatable :: bvect(:,:) ! one per present block + real(psb_dpk_), parameter :: tolerance = 1.0e-10_psb_dpk_ + + call psb_init(context) + call psb_info(context, my_rank, num_procs) + + ! -------- runtime parameters (rank 0 parses, then broadcast) -------- + field1_size = 2000000_psb_lpk_ + field2_size = 500000_psb_lpk_ + n_reps = 50 + stride = 1_psb_lpk_ ! B/B^T coupling distance: small => local halo + ! (latency-bound), large (~n/2) => big halo + ! (bandwidth-bound) + if (my_rank == 0) then + narg = command_argument_count() + if (narg >= 1) then + call get_command_argument(1, arg); read(arg,*) field1_size + end if + if (narg >= 2) then + call get_command_argument(2, arg); read(arg,*) field2_size + end if + if (narg >= 3) then + call get_command_argument(3, arg); read(arg,*) n_reps + end if + if (narg >= 4) then + call get_command_argument(4, arg); read(arg,*) stride + end if + end if + call psb_bcast(context, field1_size) + call psb_bcast(context, field2_size) + call psb_bcast(context, n_reps) + call psb_bcast(context, stride) + n_warm = max(5, n_reps/10) + + !--------------------------------------------------------------- + ! 1) build the 2x2 nested operator with real per-block halos + !--------------------------------------------------------------- + call nested_matrix%init(context, [field1_size, field2_size], info) + if (info /= psb_success_) then + if (my_rank==0) write(*,*) 'FAIL: init info=', info; goto 9999 + end if + field1_rows = nested_matrix%get_owned_rows(1) + field2_rows = nested_matrix%get_owned_rows(2) + field1_local_rows = size(field1_rows) + field2_local_rows = size(field2_rows) + + ! A = tridiag(-1,2,-1) -> block (1,1) + allocate(entry_rows(3*field1_local_rows), entry_cols(3*field1_local_rows), entry_vals(3*field1_local_rows)) + entry_idx = 0 + do i_local_row = 1, field1_local_rows + global_row = field1_rows(i_local_row) + entry_idx = entry_idx + 1 + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row; entry_vals(entry_idx) = 2.0_psb_dpk_ + if (global_row > 1) then + entry_idx = entry_idx + 1 + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row - 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_ + end if + if (global_row < field1_size) then + entry_idx = entry_idx + 1 + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row + 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_ + end if + end do + call nested_matrix%ins(1, 1, entry_idx, entry_rows, entry_cols, entry_vals, info) + deallocate(entry_rows, entry_cols, entry_vals) + + ! B^T -> block (1,2): rows in field 1, columns in field 2 (RECTANGULAR n1 x n2). + ! Each row r couples to field-2 columns (r mod n2) and (r + n2/2 mod n2); the + ! second one lands on another process => real, distinct halo for this block. + allocate(entry_rows(2*field1_local_rows), entry_cols(2*field1_local_rows), entry_vals(2*field1_local_rows)) + entry_idx = 0 + do i_local_row = 1, field1_local_rows + global_row = field1_rows(i_local_row) + entry_idx = entry_idx + 1 + gcol = mod(global_row - 1_psb_lpk_, field2_size) + 1_psb_lpk_ + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.5_psb_dpk_ + entry_idx = entry_idx + 1 + gcol = mod(global_row - 1_psb_lpk_ + stride, field2_size) + 1_psb_lpk_ + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.25_psb_dpk_ + end do + call nested_matrix%ins(1, 2, entry_idx, entry_rows, entry_cols, entry_vals, info) + deallocate(entry_rows, entry_cols, entry_vals) + + ! B -> block (2,1): rows in field 2, columns in field 1 (RECTANGULAR n2 x n1). + allocate(entry_rows(2*field2_local_rows), entry_cols(2*field2_local_rows), entry_vals(2*field2_local_rows)) + entry_idx = 0 + do i_local_row = 1, field2_local_rows + global_row = field2_rows(i_local_row) + entry_idx = entry_idx + 1 + gcol = mod(global_row - 1_psb_lpk_, field1_size) + 1_psb_lpk_ + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.3_psb_dpk_ + entry_idx = entry_idx + 1 + gcol = mod(global_row - 1_psb_lpk_ + stride, field1_size) + 1_psb_lpk_ + entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.15_psb_dpk_ + end do + call nested_matrix%ins(2, 1, entry_idx, entry_rows, entry_cols, entry_vals, info) + deallocate(entry_rows, entry_cols, entry_vals) + + call nested_matrix%asb(info) + if (info /= psb_success_) then + if (my_rank==0) write(*,*) 'FAIL: asb info=', info; goto 9999 + end if + + !--------------------------------------------------------------- + ! 2) global-local work vectors (x[g] = g on the owned entries) + !--------------------------------------------------------------- + n_local_rows = nested_matrix%desc_glob%get_local_rows() + n_local_cols = nested_matrix%desc_glob%get_local_cols() + allocate(x_arr(n_local_cols), work_x(n_local_cols), & + & y_union(n_local_cols), y_perblock(n_local_cols)) + x_arr = dzero + do i_local_row = 1, n_local_rows + call nested_matrix%desc_glob%l2g(i_local_row, global_row, info) + x_arr(i_local_row) = real(global_row, psb_dpk_) + end do + + !--------------------------------------------------------------- + ! 3) correctness: union vs per-block on the same x + !--------------------------------------------------------------- + work_x = x_arr + call psb_spmm(done, nested_matrix%a_glob, work_x, dzero, y_union, nested_matrix%desc_glob, info) + if (info /= psb_success_) then + if (my_rank==0) write(*,*) 'FAIL: psb_spmm (union) info=', info; goto 9999 + end if + call nest_spmv_perblock(nested_matrix, done, x_arr, dzero, y_perblock, info, n_exch, .false.) + if (info /= psb_success_) then + if (my_rank==0) write(*,*) 'FAIL: per-block driver info=', info; goto 9999 + end if + mismatch_norm = dzero + do i_local_row = 1, n_local_rows + mismatch_norm = max(mismatch_norm, abs(y_union(i_local_row) - y_perblock(i_local_row))) + end do + call psb_amx(context, mismatch_norm) + + !--------------------------------------------------------------- + ! 4) per-scheme PURE COMMUNICATION sweep. + ! The comm scheme is selected per descriptor (desc%set_comm_scheme) and is + ! honoured ONLY by the encapsulated vect path; the array path used in (3) + ! is always baseline. For each scheme we time, on persistent vects: + ! union = one psb_halo over desc_glob + ! per-block = one psb_halo per present block_col_desc(i,j) + !--------------------------------------------------------------- + n_schemes = 5 + allocate(schemes(n_schemes), scheme_names(n_schemes)) + schemes = [ psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_, & + & psb_comm_persistent_ineighbor_alltoallv_, & + & psb_comm_rma_pull_, psb_comm_rma_push_ ] + scheme_names = [ character(len=34) :: 'isend_irecv (baseline)', & + & 'ineighbor_alltoallv', 'persistent_ineighbor_alltoallv', & + & 'rma_pull', 'rma_push' ] + allocate(bvect(nested_matrix%n_fields, nested_matrix%n_fields)) + + if (my_rank == 0) then + write(*,'(a,i0,a,i0,a,i0,a,i0,a,i0)') ' np=', num_procs, ' n1=', field1_size, & + & ' n2=', field2_size, ' stride=', stride, ' reps=', n_reps + write(*,'(a,es12.4)') ' max|y_union - y_perblock| = ', mismatch_norm + if (mismatch_norm <= tolerance) then + write(*,*) ' PASS: regimes agree' + else + write(*,*) ' FAIL: regimes disagree' + end if + write(*,'(a,i0)') ' halo exchanges: union = 1 per-block = ', n_exch + write(*,*) + write(*,'(a)') ' pure halo communication time [s] (min over reps, slowest rank)' + write(*,'(a)') ' scheme union per-block ratio' + end if + + do s_idx = 1, n_schemes + call set_scheme(schemes(s_idx)) + call build_vects() + call time_comm('U', t_u, ok_u) + call time_comm('B', t_b, ok_b) + call free_vects() + if (my_rank == 0) then + if (ok_u .and. ok_b) then + write(*,'(1x,a34,es15.4,es15.4,3x,f7.2)') scheme_names(s_idx), t_u, t_b, & + & t_b/max(t_u, tiny(t_u)) + else + write(*,'(1x,a34,a)') scheme_names(s_idx), ' (unavailable on this build/MPI)' + end if + end if + end do + + deallocate(bvect, schemes, scheme_names) + call nested_matrix%free(info) +9999 continue + call psb_exit(context) + +contains + + ! Set the communication scheme on the global descriptor and on every present + ! per-block descriptor (the per-block exchanges must use the same scheme). + subroutine set_scheme(scheme) + integer(psb_ipk_), intent(in) :: scheme + integer(psb_ipk_) :: i, j, linfo + call nested_matrix%desc_glob%set_comm_scheme(scheme, linfo) + do j = 1, nested_matrix%n_fields + do i = 1, nested_matrix%n_fields + if (nested_matrix%block_storage%has_block(i,j)) & + & call nested_matrix%block_col_desc(i,j)%set_comm_scheme(scheme, linfo) + end do + end do + end subroutine set_scheme + + ! Fresh persistent halo vectors (the comm_handle, hence the scheme, is created + ! from desc%comm_type on the first psb_halo and then reused across reps). + subroutine build_vects() + integer(psb_ipk_) :: i, j, linfo + call psb_geall(xg_vect, nested_matrix%desc_glob, linfo) + call psb_geasb(xg_vect, nested_matrix%desc_glob, linfo) + do j = 1, nested_matrix%n_fields + do i = 1, nested_matrix%n_fields + if (.not. nested_matrix%block_storage%has_block(i,j)) cycle + call psb_geall(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo) + call psb_geasb(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo) + end do + end do + end subroutine build_vects + + subroutine free_vects() + integer(psb_ipk_) :: i, j, linfo + call xg_vect%free(linfo) + do j = 1, nested_matrix%n_fields + do i = 1, nested_matrix%n_fields + if (nested_matrix%block_storage%has_block(i,j)) call bvect(i,j)%free(linfo) + end do + end do + end subroutine free_vects + + ! Time the pure communication of one regime: n_warm warm-up runs, then min over + ! n_reps of the slowest rank's wall time (psb_amx = max across ranks). + subroutine time_comm(code, t_min, ok) + character, intent(in) :: code ! 'U' union / 'B' per-block + real(psb_dpk_), intent(out) :: t_min + logical, intent(out) :: ok + integer(psb_ipk_) :: rep, linfo + real(psb_dpk_) :: t0, dt + ok = .true. + do rep = 1, n_warm + call do_comm(code, linfo); if (linfo /= 0) ok = .false. + end do + t_min = huge(t_min) + do rep = 1, n_reps + call psb_barrier(context) + t0 = psb_wtime() + call do_comm(code, linfo) + dt = psb_wtime() - t0 + call psb_amx(context, dt) + t_min = min(t_min, dt) + if (linfo /= 0) ok = .false. + end do + if (.not. ok) t_min = dzero + end subroutine time_comm + + subroutine do_comm(code, linfo) + character, intent(in) :: code + integer(psb_ipk_), intent(out) :: linfo + integer(psb_ipk_) :: i, j + linfo = 0 + if (code == 'U') then + call psb_halo(xg_vect, nested_matrix%desc_glob, linfo) + else + do j = 1, nested_matrix%n_fields + do i = 1, nested_matrix%n_fields + if (.not. nested_matrix%block_storage%has_block(i,j)) cycle + call psb_halo(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo) + if (linfo /= 0) return + end do + end do + end if + end subroutine do_comm + + ! Per-block nested matvec, exchanging one block's halo at a time. x, y are in + ! the composed global-local layout. halo_only=.true. performs ONLY the + ! per-block psb_halo calls (for the pure-communication timing). + subroutine nest_spmv_perblock(op, alpha, x, beta, y, info, n_exchanges, halo_only) + type(psb_d_nest_matrix), intent(inout) :: op + real(psb_dpk_), intent(in) :: x(:) + real(psb_dpk_), intent(inout) :: y(:) + real(psb_dpk_), intent(in) :: alpha, beta + integer(psb_ipk_), intent(out) :: info, n_exchanges + logical, intent(in) :: halo_only + + integer(psb_ipk_) :: nf, i, j, k, nrl, nownj, nowni, nfc, nbc + integer(psb_ipk_), allocatable :: owned_offset(:) + real(psb_dpk_), allocatable :: xs(:), xf(:), yb(:) + + info = psb_success_; n_exchanges = 0 + nf = op%n_fields + allocate(owned_offset(nf+1)) + owned_offset(1) = 0 + do j = 1, nf + owned_offset(j+1) = owned_offset(j) + op%field_desc(j)%get_local_rows() + end do + nrl = owned_offset(nf+1) + + if (.not. halo_only) then + if (beta == dzero) then + y(1:nrl) = dzero + else if (beta /= done) then + y(1:nrl) = beta * y(1:nrl) + end if + end if + + do j = 1, nf + nownj = op%field_desc(j)%get_local_rows() + nfc = op%field_desc(j)%get_local_cols() + do i = 1, nf + if (.not. op%block_storage%has_block(i,j)) cycle + nbc = op%block_col_desc(i,j)%get_local_cols() + allocate(xs(nbc)); xs = dzero + xs(1:nownj) = x(owned_offset(j)+1 : owned_offset(j)+nownj) + call psb_halo(xs, op%block_col_desc(i,j), info) + if (info /= psb_success_) return + n_exchanges = n_exchanges + 1 + if (.not. halo_only) then + allocate(xf(nfc)); xf = dzero + do k = 1, nbc + xf(op%blk2field(i,j)%map(k)) = xs(k) + end do + nowni = op%field_desc(i)%get_local_rows() + allocate(yb(nowni)); yb = dzero + call op%block_storage%mats(i,j)%a%csmv(alpha, xf, dzero, yb, info) + if (info /= psb_success_) return + y(owned_offset(i)+1 : owned_offset(i)+nowni) = & + & y(owned_offset(i)+1 : owned_offset(i)+nowni) + yb + deallocate(xf, yb) + end if + deallocate(xs) + end do + end do + deallocate(owned_offset) + end subroutine nest_spmv_perblock + +end program psb_d_nest_halo_regime_test