[WIP] RMA swapdata optimization, nest builder updates, GPU/scorep test tooling

- psi_dswapdata / psb_comm_rma: RMA swapdata optimization (PSCW + Win_allocate) - psb_d_nest_builder: builder updates - cuda: Makefile + psb_d_cuda_vect_mod adjustments - test/comm: spmv/cg drivers, Makefiles, GPU + scorep/strong-scaling scripts - test/nested: halo-regime test + Makefile/CMake updates
1 day ago · 9533c22989
parent 4106f44331
commit 9533c22989
15 changed files with 1324 additions and 282 deletions
--- a/base/comm/internals/psi_dswapdata.F90
+++ b/base/comm/internals/psi_dswapdata.F90
@ -1009,6 +1009,8 @@ contains
    integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
    integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size
    integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
+    integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
+    integer(psb_mpk_), allocatable :: grp_ranks(:)
    logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
    type(psb_comm_rma_handle), pointer :: rma_handle
    character(len=30) :: name
@ -1077,59 +1079,42 @@ contains
        end if
      end if

+      ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
+      ! combuf is the pinned/registered buffer, so RMA uses pinned memory.
      if (buffer_size > 0) then
        if (.not. allocated(y%combuf)) then
-          ! Allocate buffer with size of comm_indexes%v to include all metadata,
-          ! matching baseline and topology buffer layout
          call y%new_buffer(ione*size(comm_indexes%v), info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        else if (size(y%combuf) < size(comm_indexes%v)) then
-          ! Need a larger exposed memory area: recreate the RMA window first,
-          ! then reallocate combuf and lazily create a new window below.
          if (rma_handle%window_open) then
            call mpi_win_unlock_all(rma_handle%win, iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
            rma_handle%window_open = .false.
          end if
          if (rma_handle%window_ready) then
            call mpi_win_free(rma_handle%win, iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
-            rma_handle%window_ready = .false.
            rma_handle%win = mpi_win_null
+            rma_handle%window_ready = .false.
          end if
-          ! Allocate buffer with size of comm_indexes%v to include all metadata,
-          ! matching baseline and topology buffer layout
          call y%new_buffer(ione*size(comm_indexes%v), info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        end if
-      end if

-      if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
-        ! Expose combuf once and keep the window around until the descriptor changes.
-        element_bytes = storage_size(y%combuf(1))/8
-        exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
-        call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
-             & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
+        if (.not. rma_handle%window_ready) then
+          element_bytes = storage_size(y%combuf(1))/8
+          exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
+          call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
+               & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_
+            call psb_errpush(info,name,m_err=(/iret/))
+            goto 9999
+          end if
+          rma_handle%window_ready = .true.
        end if
-        rma_handle%window_ready = .true.
      end if

      if (buffer_size > 0) then
@ -1138,7 +1123,19 @@ contains
        end if
        call y%device_wait()

-        ! Pull data from each peer with per-neighbor passive lock (neighbor-only sync).
+        ! Open exposure (post) + access (start) epochs over the CACHED neighbor
+        ! group (built once with the layout in the handle) -- no per-call build.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+        end if
+
        do neighbor_idx=1, num_neighbors
          proc_to_comm = rma_handle%peer_proc(neighbor_idx)
          recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -1161,12 +1158,6 @@ contains
            end if

            if (recv_count > 0) then
-              call mpi_win_lock(MPI_LOCK_SHARED, prc_rank, 0, rma_handle%win, iret)
-              if (iret /= mpi_success) then
-                info = psb_err_mpi_error_
-                call psb_errpush(info,name,m_err=(/iret/))
-                goto 9999
-              end if
              remote_disp = int(remote_base - 1, kind=MPI_ADDRESS_KIND)
              call mpi_get(y%combuf(recv_pos), recv_count, psb_mpi_r_dpk_, prc_rank, remote_disp, recv_count, psb_mpi_r_dpk_, &
                   & rma_handle%win, iret)
@ -1175,12 +1166,6 @@ contains
                call psb_errpush(info,name,m_err=(/iret/))
                goto 9999
              end if
-              call mpi_win_unlock(prc_rank, rma_handle%win, iret)
-              if (iret /= mpi_success) then
-                info = psb_err_mpi_error_
-                call psb_errpush(info,name,m_err=(/iret/))
-                goto 9999
-              end if
            end if
          else
            if (send_count /= recv_count) then
@ -1191,6 +1176,19 @@ contains
            y%combuf(recv_pos:recv_pos+recv_count-1) = y%combuf(send_pos:send_pos+send_count-1)
          end if
        end do
+
+        ! Close access epoch (my GETs done) then exposure epoch (peers done
+        ! reading my window): received data is now in y%combuf. Cached group, no free.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_complete(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_wait(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+        end if
      end if
    end if

@ -1235,7 +1233,8 @@ contains
    integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
    integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size
    integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
-    integer(psb_mpk_), parameter :: rma_push_notify_tag = 914_psb_mpk_
+    integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
+    integer(psb_mpk_), allocatable :: grp_ranks(:)
    logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
    type(psb_comm_rma_handle), pointer :: rma_handle
    character(len=30) :: name
@ -1302,59 +1301,42 @@ contains
        end if
      end if

+      ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
+      ! combuf is the pinned/registered buffer, so RMA uses pinned memory.
      if (buffer_size > 0) then
        if (.not. allocated(y%combuf)) then
-          ! Allocate buffer with size of comm_indexes%v to include all metadata,
-          ! matching baseline and topology buffer layout
          call y%new_buffer(ione*size(comm_indexes%v), info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        else if (size(y%combuf) < size(comm_indexes%v)) then
-          ! Need a larger exposed memory area: recreate the RMA window first,
-          ! then reallocate combuf and lazily create a new window below.
          if (rma_handle%window_open) then
            call mpi_win_unlock_all(rma_handle%win, iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
            rma_handle%window_open = .false.
          end if
          if (rma_handle%window_ready) then
            call mpi_win_free(rma_handle%win, iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
-            rma_handle%window_ready = .false.
            rma_handle%win = mpi_win_null
+            rma_handle%window_ready = .false.
          end if
-          ! Allocate buffer with size of comm_indexes%v to include all metadata,
-          ! matching baseline and topology buffer layout
          call y%new_buffer(ione*size(comm_indexes%v), info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        end if
-      end if

-      if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
-        ! Keep the window alive across repetitions: it is created once and reused.
-        element_bytes = storage_size(y%combuf(1))/8
-        exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
-        call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
-             & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
+        if (.not. rma_handle%window_ready) then
+          element_bytes = storage_size(y%combuf(1))/8
+          exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
+          call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
+               & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_
+            call psb_errpush(info,name,m_err=(/iret/))
+            goto 9999
+          end if
+          rma_handle%window_ready = .true.
        end if
-        rma_handle%window_ready = .true.
      end if

      if (buffer_size > 0) then
@ -1363,34 +1345,20 @@ contains
        end if
        call y%device_wait()

-        ! Pre-post notification receives before opening the window (prevents isend/irecv ordering issues).
-        if (num_neighbors > 0) then
-          rma_handle%notify_recv_reqs(1:num_neighbors) = MPI_REQUEST_NULL
-          rma_handle%notify_send_reqs(1:num_neighbors) = MPI_REQUEST_NULL
-        end if
-        do neighbor_idx=1, num_neighbors
-          proc_to_comm = rma_handle%peer_proc(neighbor_idx)
-          if (proc_to_comm /= my_rank) then
-            prc_rank = rma_handle%peer_mpi_rank(neighbor_idx)
-            call mpi_irecv(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
-                 & rma_push_notify_tag, icomm, rma_handle%notify_recv_reqs(neighbor_idx), iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
+        ! Open exposure (post) + access (start) epochs over the CACHED neighbor
+        ! group (built once with the layout in the handle) -- no per-call build.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
          end if
-        end do
-
-        call mpi_win_lock_all(0, rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
        end if
-        rma_handle%window_open = .true.

-        ! Push data to each peer; after flush send a P2P notification so target knows data arrived.
+        ! Issue every PUT (remote) or perform the self-copy (local).
        do neighbor_idx=1, num_neighbors
          proc_to_comm = rma_handle%peer_proc(neighbor_idx)
          recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -1422,19 +1390,6 @@ contains
                goto 9999
              end if
            end if
-            call mpi_win_flush(prc_rank, rma_handle%win, iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
-            call mpi_isend(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
-                 & rma_push_notify_tag, icomm, rma_handle%notify_send_reqs(neighbor_idx), iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
          else
            if (send_count /= recv_count) then
              info = psb_err_internal_error_
@ -1444,36 +1399,24 @@ contains
            y%combuf(recv_pos:recv_pos+recv_count-1) = y%combuf(send_pos:send_pos+send_count-1)
          end if
        end do
-      end if
-    end if
-
-    ! WAIT phase: close epoch, wait for P2P notifications, then scatter.
-    if (do_wait) then
-      if (rma_handle%window_open) then
-        call mpi_win_unlock_all(rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
-        end if
-        rma_handle%window_open = .false.
-      end if

-      if (num_neighbors > 0) then
-        call mpi_waitall(num_neighbors, rma_handle%notify_recv_reqs, MPI_STATUSES_IGNORE, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
-        end if
-        call mpi_waitall(num_neighbors, rma_handle%notify_send_reqs, MPI_STATUSES_IGNORE, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
+        ! Close access epoch (my PUTs done) then exposure epoch (peers' PUTs into
+        ! me done): receive area now holds the pushed data. Cached group, no free.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_complete(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_wait(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
        end if
      end if
+    end if

+    ! WAIT phase: data already delivered by PSCW in START; just scatter into Y.
+    if (do_wait) then
      if (total_recv > 0) then
        call y%sct(int(total_recv,psb_mpk_), rma_handle%peer_recv_indexes, y%combuf(total_send+1:total_send+total_recv), beta)
      end if
@ -2341,6 +2284,8 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
    integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
    integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size, total_send_, total_recv_
    integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
+    integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
+    integer(psb_mpk_), allocatable :: grp_ranks(:)
    logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
    type(psb_comm_rma_handle), pointer :: rma_handle
    character(len=30) :: name
@ -2406,33 +2351,42 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
        end if
      end if

+      ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
+      ! combuf is the pinned/registered buffer, so RMA uses pinned memory.
      if (buffer_size > 0) then
        if (.not. allocated(y%combuf)) then
          call y%new_buffer(buffer_size, info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        else if (size(y%combuf) < buffer_size) then
+          if (rma_handle%window_open) then
+            call mpi_win_unlock_all(rma_handle%win, iret)
+            rma_handle%window_open = .false.
+          end if
+          if (rma_handle%window_ready) then
+            call mpi_win_free(rma_handle%win, iret)
+            rma_handle%win = mpi_win_null
+            rma_handle%window_ready = .false.
+          end if
          call y%new_buffer(buffer_size, info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        end if
-      end if

-      if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
-        element_bytes = storage_size(y%combuf(1))/8
-        exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
-        call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
-             & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
+        if (.not. rma_handle%window_ready) then
+          element_bytes = storage_size(y%combuf(1))/8
+          exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
+          call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
+               & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_
+            call psb_errpush(info,name,m_err=(/iret/))
+            goto 9999
+          end if
+          rma_handle%window_ready = .true.
        end if
-        rma_handle%window_ready = .true.
      end if

      if (buffer_size > 0) then
@ -2441,7 +2395,19 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
        end if
        call y%device_wait()

-        ! Pull data from each peer with per-neighbor passive lock (neighbor-only sync).
+        ! Open exposure (post) + access (start) epochs over the CACHED neighbor
+        ! group (built once with the layout in the handle) -- no per-call build.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+        end if
+
        do neighbor_idx=1, num_neighbors
          proc_to_comm = rma_handle%peer_proc(neighbor_idx)
          recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -2458,12 +2424,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
              goto 9999
            end if
            if (recv_count > 0) then
-              call mpi_win_lock(MPI_LOCK_SHARED, prc_rank, 0, rma_handle%win, iret)
-              if (iret /= mpi_success) then
-                info = psb_err_mpi_error_
-                call psb_errpush(info,name,m_err=(/iret/))
-                goto 9999
-              end if
              remote_disp = int((remote_base - 1) * n, kind=MPI_ADDRESS_KIND)
              call mpi_get(y%combuf(recv_pos), recv_count*n, psb_mpi_r_dpk_, prc_rank, remote_disp, recv_count*n, psb_mpi_r_dpk_, &
                   & rma_handle%win, iret)
@ -2472,12 +2432,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
                call psb_errpush(info,name,m_err=(/iret/))
                goto 9999
              end if
-              call mpi_win_unlock(prc_rank, rma_handle%win, iret)
-              if (iret /= mpi_success) then
-                info = psb_err_mpi_error_
-                call psb_errpush(info,name,m_err=(/iret/))
-                goto 9999
-              end if
            end if
          else
            if (send_count /= recv_count) then
@ -2488,6 +2442,19 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
            y%combuf(recv_pos:recv_pos+recv_count*n-1) = y%combuf(send_pos:send_pos+send_count*n-1)
          end if
        end do
+
+        ! Close access epoch (my GETs done) then exposure epoch (peers done
+        ! reading my window): received data is now in y%combuf. Cached group, no free.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_complete(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_wait(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+        end if
      end if
    end if

@ -2531,7 +2498,8 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
    integer(kind=MPI_ADDRESS_KIND) :: remote_disp, exposed_bytes
    integer(psb_ipk_) :: err_act, neighbor_idx, buffer_size, total_send_, total_recv_
    integer(psb_ipk_), allocatable :: peer_mpi_rank(:)
-    integer(psb_mpk_), parameter :: rma_push_notify_tag = 914_psb_mpk_
+    integer(psb_mpk_) :: comm_grp, nbr_grp, n_off, grp_idx
+    integer(psb_mpk_), allocatable :: grp_ranks(:)
    logical :: do_start, do_wait, memory_buffer_layout_rebuild_needed
    type(psb_comm_rma_handle), pointer :: rma_handle
    character(len=30) :: name
@ -2597,33 +2565,42 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
        end if
      end if

+      ! Expose y%combuf as the RMA window (created once, reused). On GPU vectors
+      ! combuf is the pinned/registered buffer, so RMA uses pinned memory.
      if (buffer_size > 0) then
        if (.not. allocated(y%combuf)) then
          call y%new_buffer(buffer_size, info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        else if (size(y%combuf) < buffer_size) then
+          if (rma_handle%window_open) then
+            call mpi_win_unlock_all(rma_handle%win, iret)
+            rma_handle%window_open = .false.
+          end if
+          if (rma_handle%window_ready) then
+            call mpi_win_free(rma_handle%win, iret)
+            rma_handle%win = mpi_win_null
+            rma_handle%window_ready = .false.
+          end if
          call y%new_buffer(buffer_size, info)
          if (info /= psb_success_) then
-            call psb_errpush(psb_err_alloc_dealloc_,name)
-            goto 9999
+            call psb_errpush(psb_err_alloc_dealloc_,name); goto 9999
          end if
        end if
-      end if

-      if ((buffer_size > 0).and.(.not. rma_handle%window_ready)) then
-        element_bytes = storage_size(y%combuf(1))/8
-        exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
-        call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
-             & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
+        if (.not. rma_handle%window_ready) then
+          element_bytes = storage_size(y%combuf(1))/8
+          exposed_bytes = int(size(y%combuf),kind=MPI_ADDRESS_KIND) * int(element_bytes,kind=MPI_ADDRESS_KIND)
+          call mpi_win_create(y%combuf, exposed_bytes, element_bytes, &
+               & mpi_info_null, ctxt%get_mpic(), rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_
+            call psb_errpush(info,name,m_err=(/iret/))
+            goto 9999
+          end if
+          rma_handle%window_ready = .true.
        end if
-        rma_handle%window_ready = .true.
      end if

      if (buffer_size > 0) then
@ -2632,33 +2609,20 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
        end if
        call y%device_wait()

-        ! Pre-post notification receives before opening the window.
-        if (num_neighbors > 0) then
-          rma_handle%notify_recv_reqs(1:num_neighbors) = MPI_REQUEST_NULL
-          rma_handle%notify_send_reqs(1:num_neighbors) = MPI_REQUEST_NULL
-        end if
-        do neighbor_idx=1, num_neighbors
-          proc_to_comm = rma_handle%peer_proc(neighbor_idx)
-          if (proc_to_comm /= my_rank) then
-            prc_rank = rma_handle%peer_mpi_rank(neighbor_idx)
-            call mpi_irecv(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
-                 & rma_push_notify_tag, icomm, rma_handle%notify_recv_reqs(neighbor_idx), iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
+        ! Open exposure (post) + access (start) epochs over the CACHED neighbor
+        ! group (built once with the layout in the handle) -- no per-call build.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_post(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_start(rma_handle%nbr_grp, 0, rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
          end if
-        end do
-
-        call mpi_win_lock_all(0, rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
        end if
-        rma_handle%window_open = .true.

+        ! Issue every PUT (remote) or perform the self-copy (local).
        do neighbor_idx=1, num_neighbors
          proc_to_comm = rma_handle%peer_proc(neighbor_idx)
          recv_count = rma_handle%peer_recv_counts(neighbor_idx)
@ -2684,19 +2648,6 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
                goto 9999
              end if
            end if
-            call mpi_win_flush(prc_rank, rma_handle%win, iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
-            call mpi_isend(rma_handle%notify_buf(neighbor_idx), 1, psb_mpi_mpk_, prc_rank, &
-                 & rma_push_notify_tag, icomm, rma_handle%notify_send_reqs(neighbor_idx), iret)
-            if (iret /= mpi_success) then
-              info = psb_err_mpi_error_
-              call psb_errpush(info,name,m_err=(/iret/))
-              goto 9999
-            end if
          else
            if (send_count /= recv_count) then
              info = psb_err_internal_error_
@ -2706,34 +2657,24 @@ end subroutine psi_dswap_neighbor_topology_multivect_persistent
            y%combuf(recv_pos:recv_pos+recv_count*n-1) = y%combuf(send_pos:send_pos+send_count*n-1)
          end if
        end do
+
+        ! Close access epoch (my PUTs done) then exposure epoch (peers' PUTs into
+        ! me done): receive area now holds the pushed data. Cached group, no free.
+        if (rma_handle%nbr_grp /= mpi_group_null) then
+          call mpi_win_complete(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+          call mpi_win_wait(rma_handle%win, iret)
+          if (iret /= mpi_success) then
+            info = psb_err_mpi_error_; call psb_errpush(info,name,m_err=(/iret/)); goto 9999
+          end if
+        end if
      end if
    end if

-    ! WAIT phase: close epoch, wait for P2P notifications, then scatter.
+    ! WAIT phase: data already delivered by PSCW in START; just scatter into Y.
    if (do_wait) then
-      if (rma_handle%window_open) then
-        call mpi_win_unlock_all(rma_handle%win, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
-        end if
-        rma_handle%window_open = .false.
-      end if
-      if (num_neighbors > 0) then
-        call mpi_waitall(num_neighbors, rma_handle%notify_recv_reqs, MPI_STATUSES_IGNORE, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
-        end if
-        call mpi_waitall(num_neighbors, rma_handle%notify_send_reqs, MPI_STATUSES_IGNORE, iret)
-        if (iret /= mpi_success) then
-          info = psb_err_mpi_error_
-          call psb_errpush(info,name,m_err=(/iret/))
-          goto 9999
-        end if
-      end if
      if (total_recv > 0) then
        call y%sct(int(total_recv,psb_mpk_), rma_handle%peer_recv_indexes, y%combuf(total_send_+1:total_send_+total_recv_), beta)
      end if
--- a/base/modules/comm/comm_schemes/psb_comm_rma_mod.F90
+++ b/base/modules/comm/comm_schemes/psb_comm_rma_mod.F90
@ -3,6 +3,7 @@ module psb_comm_rma_mod
  use psb_desc_const_mod, only: psb_proc_id_, psb_n_elem_recv_, psb_elem_recv_, &
       & psb_n_elem_send_, psb_elem_send_
  use psb_error_mod
+  use, intrinsic :: iso_c_binding, only: c_ptr, c_null_ptr
 #ifdef PSB_MPI_MOD
  use mpi
 #endif
@ -18,6 +19,9 @@ module psb_comm_rma_mod
    integer(psb_mpk_) :: win = mpi_win_null
    logical :: window_ready = .false.
    logical :: window_open = .false.
+    type(c_ptr) :: win_base = c_null_ptr
+    integer(psb_ipk_) :: win_nelem = 0
+    integer(psb_mpk_) :: nbr_grp = mpi_group_null   ! neighbor MPI group, built once with the layout
    logical :: layout_ready = .false.
    integer(psb_ipk_) :: layout_nnbr = -1
    integer(psb_ipk_) :: layout_send = -1
@ -57,6 +61,8 @@ contains
    this%win = mpi_win_null
    this%window_ready = .false.
    this%window_open = .false.
+    this%win_base = c_null_ptr
+    this%win_nelem = 0
    this%layout_ready = .false.
    this%layout_nnbr = -1
    this%layout_send = -1
@ -67,8 +73,14 @@ contains
  subroutine psb_comm_rma_clear_memory_buffer_layout(this, info)
    class(psb_comm_rma_handle), intent(inout) :: this
    integer(psb_ipk_), intent(out) :: info
+    integer(psb_mpk_) :: iret

    info = psb_success_
+    ! Neighbors change when the layout is rebuilt: free the cached group.
+    if (this%nbr_grp /= mpi_group_null) then
+      call mpi_group_free(this%nbr_grp, iret)
+      this%nbr_grp = mpi_group_null
+    end if
    if (allocated(this%peer_proc)) deallocate(this%peer_proc)
    if (allocated(this%peer_send_counts)) deallocate(this%peer_send_counts)
    if (allocated(this%peer_recv_counts)) deallocate(this%peer_recv_counts)
@ -105,6 +117,8 @@ contains
    integer(psb_ipk_) :: list_pos, send_offset, recv_offset
    integer(psb_ipk_) :: proc_to_comm, recv_count, send_count
    integer(psb_ipk_) :: local_meta(4), remote_meta(4)
+    integer(psb_mpk_) :: comm_grp, n_off, grp_idx
+    integer(psb_mpk_), allocatable :: grp_ranks(:)

    call this%clear_memory_buffer_layout(info)
    if (info /= psb_success_) return
@ -206,6 +220,39 @@ contains
      return
    end if

+    ! Build the neighbor MPI group ONCE (off-diagonal peers); reused by every
+    ! RMA exchange (PSCW post/start) instead of rebuilding it each call.
+    n_off = 0
+    do neighbor_idx = 1, n_neighbors
+      if (this%peer_proc(neighbor_idx) /= my_rank) n_off = n_off + 1
+    end do
+    if (n_off > 0) then
+      allocate(grp_ranks(n_off), stat=iret)
+      if (iret /= 0) then
+        info = psb_err_alloc_dealloc_
+        return
+      end if
+      grp_idx = 0
+      do neighbor_idx = 1, n_neighbors
+        if (this%peer_proc(neighbor_idx) /= my_rank) then
+          grp_idx = grp_idx + 1
+          grp_ranks(grp_idx) = this%peer_mpi_rank(neighbor_idx)
+        end if
+      end do
+      call mpi_comm_group(icomm, comm_grp, iret)
+      if (iret /= mpi_success) then
+        info = psb_err_mpi_error_
+        return
+      end if
+      call mpi_group_incl(comm_grp, n_off, grp_ranks, this%nbr_grp, iret)
+      if (iret /= mpi_success) then
+        info = psb_err_mpi_error_
+        return
+      end if
+      call mpi_group_free(comm_grp, iret)
+      deallocate(grp_ranks, stat=iret)
+    end if
+
    this%layout_nnbr = n_neighbors
    this%layout_send = send_total
    this%layout_recv = recv_total
@ -229,6 +276,8 @@ contains
      call mpi_win_free(this%win, iret)
      this%win = mpi_win_null
    end if
+    this%win_base = c_null_ptr
+    this%win_nelem = 0
    this%window_ready = .false.
    this%layout_ready = .false.
    this%layout_nnbr = -1
--- a/base/modules/tools/psb_d_nest_builder_mod.F90
+++ b/base/modules/tools/psb_d_nest_builder_mod.F90
@ -83,6 +83,15 @@ module psb_d_nest_builder_mod
    real(psb_dpk_),    allocatable :: entry_vals(:)
  end type psb_d_nest_block_buffer

+  ! per-block selective halo support: scatter map from a block's own (restricted)
+  ! column descriptor onto the field-j column space the block matrix is localized
+  ! against.  map(k) is the field-j local column position of the k-th local column
+  ! of block_col_desc(i,j); used to scatter a per-block psb_halo result into the
+  ! field vector consumed by the (field-localized) block csmv.
+  type :: psb_d_nest_blk2field_map
+    integer(psb_ipk_), allocatable :: map(:)
+  end type psb_d_nest_blk2field_map
+
  type :: psb_d_nest_matrix
    type(psb_ctxt_type)                          :: context
    integer(psb_ipk_)                            :: n_fields  = 0
@ -95,6 +104,14 @@ module psb_d_nest_builder_mod
    type(psb_desc_nest_type)                     :: grid_desc
    type(psb_dspmat_type)                        :: a_glob               ! the matrix to hand to Krylov
    type(psb_desc_type)                          :: desc_glob            ! the global descriptor
+    ! per-block (selective) halo-exchange support.  block_col_desc(i,j) carries
+    ! ONLY block (i,j)'s own off-process columns (a subset of field j's union
+    ! halo), so its halo can be exchanged independently of the rest of column j;
+    ! blk2field(i,j) maps its local columns onto the field-j local columns the
+    ! block matrix is localized against.  These coexist with a_glob/desc_glob
+    ! (the fused union path) WITHOUT a second copy of the blocks.
+    type(psb_desc_type),            allocatable   :: block_col_desc(:,:)
+    type(psb_d_nest_blk2field_map), allocatable   :: blk2field(:,:)
  contains
    procedure, pass(op) :: init => psb_d_nest_op_init
    procedure, pass(op) :: ins  => psb_d_nest_op_ins
@ -107,7 +124,7 @@ module psb_d_nest_builder_mod
  end type psb_d_nest_matrix

  private
-  public :: psb_d_nest_matrix
+  public :: psb_d_nest_matrix, psb_d_nest_blk2field_map

 contains

@ -201,6 +218,8 @@ contains

    type(psb_d_nest_base_mat) :: nest_operator
    integer(psb_ipk_)         :: n_fields, i_field, j_field
+    integer(psb_ipk_)         :: field_local_rows, n_block_cols, k_col, field_col
+    integer(psb_lpk_)         :: global_col_idx
    character(len=24)         :: name

    info = psb_success_
@ -215,6 +234,50 @@ contains
      end if
    end do

+    ! 1b) per-block (restricted-halo) column descriptors + scatter maps, for the
+    !     selective halo-exchange regime.  Each present block (i,j) gets a
+    !     descriptor whose halo is ONLY block (i,j)'s own off-process columns
+    !     (registered from that single block's buffer, NOT the column-wide union),
+    !     sharing field j's owned distribution.  blk2field(i,j)%map sends its
+    !     local columns onto field j's local columns (the space the block matrix
+    !     is localized against), so a per-block psb_halo can be scattered into the
+    !     field vector consumed by the block csmv.  The buffers are still alive
+    !     here (freed in step 6).
+    allocate(op%block_col_desc(n_fields,n_fields), op%blk2field(n_fields,n_fields), stat=info)
+    if (info /= 0) then
+      info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return
+    end if
+    do j_field = 1, n_fields
+      field_local_rows = op%field_desc(j_field)%get_local_rows()
+      do i_field = 1, n_fields
+        if (op%block_buffer(i_field,j_field)%n_entries <= 0) cycle
+        ! owned distribution identical to field j (same nl => same global ranges)
+        call psb_cdall(op%context, op%block_col_desc(i_field,j_field), info, nl=field_local_rows)
+        if (info == psb_success_) &
+             & call psb_cdins(op%block_buffer(i_field,j_field)%n_entries,            &
+             &                op%block_buffer(i_field,j_field)%entry_cols(1:op%block_buffer(i_field,j_field)%n_entries), &
+             &                op%block_col_desc(i_field,j_field), info)
+        if (info == psb_success_) call psb_cdasb(op%block_col_desc(i_field,j_field), info)
+        if (info /= psb_success_) then
+          call psb_errpush(psb_err_from_subroutine_, name, a_err='per-block cdasb'); return
+        end if
+        ! block-local column k -> field-j local column position
+        n_block_cols = op%block_col_desc(i_field,j_field)%get_local_cols()
+        allocate(op%blk2field(i_field,j_field)%map(n_block_cols), stat=info)
+        if (info /= 0) then
+          info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return
+        end if
+        do k_col = 1, n_block_cols
+          call op%block_col_desc(i_field,j_field)%l2g(k_col, global_col_idx, info)
+          if (info == psb_success_) call op%field_desc(j_field)%g2l(global_col_idx, field_col, info)
+          if (info /= psb_success_) then
+            call psb_errpush(psb_err_from_subroutine_, name, a_err='blk2field l2g/g2l'); return
+          end if
+          op%blk2field(i_field,j_field)%map(k_col) = field_col
+        end do
+      end do
+    end do
+
    ! 2) build the local blocks (generally rectangular) from the triplets
    op%block_storage%nrblocks = n_fields
    op%block_storage%ncblocks = n_fields
@ -300,6 +363,17 @@ contains
      call op%desc_glob%free(local_info)
      call op%grid_desc%free(local_info)
    end if
+    if (allocated(op%block_col_desc)) then
+      do j_field = 1, size(op%block_col_desc,2)
+        do i_field = 1, size(op%block_col_desc,1)
+          ! only present blocks were assembled (and carry a blk2field map)
+          if (allocated(op%blk2field) .and. allocated(op%blk2field(i_field,j_field)%map)) &
+               & call op%block_col_desc(i_field,j_field)%free(local_info)
+        end do
+      end do
+      deallocate(op%block_col_desc, stat=local_info)
+    end if
+    if (allocated(op%blk2field)) deallocate(op%blk2field, stat=local_info)
    if (allocated(op%field_desc)) then
      do i_field = 1, size(op%field_desc)
        call op%field_desc(i_field)%free(local_info)
--- a/cuda/Makefile
+++ b/cuda/Makefile
@ -106,7 +106,7 @@ svectordev.o:  svectordev.h vectordev.h
 dvectordev.o:  dvectordev.h vectordev.h
 cvectordev.o:  cvectordev.h vectordev.h
 zvectordev.o:  zvectordev.h vectordev.h
-psb_cuda_env_mod.o: base_cusparse_mod.o
+psb_cuda_env_mod.o: base_cusparse_mod.o core_mod.o
 psb_cuda_mod.o:  psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\
 	psb_d_cuda_vect_mod.o  psb_s_cuda_vect_mod.o\
 	psb_z_cuda_vect_mod.o  psb_c_cuda_vect_mod.o\
--- a/cuda/psb_d_cuda_vect_mod.F90
+++ b/cuda/psb_d_cuda_vect_mod.F90
@ -77,6 +77,7 @@ module psb_d_cuda_vect_mod
    procedure, pass(x) :: set_sync => d_cuda_set_sync
    procedure, pass(x) :: set_scal => d_cuda_set_scal
 !!$    procedure, pass(x) :: set_vect => d_cuda_set_vect
+    procedure, pass(x) :: gthzv    => d_cuda_gthzv
    procedure, pass(x) :: gthzv_x  => d_cuda_gthzv_x
    procedure, pass(y) :: sctb     => d_cuda_sctb
    procedure, pass(y) :: sctb_x   => d_cuda_sctb_x
@ -231,6 +232,45 @@ contains
    end select
  end subroutine d_cuda_check_addr

+  subroutine d_cuda_gthzv(n,idx,x,y)
+    ! GPU override of the plain-array-index gather  y(1:n) = x(idx(1:n)).
+    ! Without it the generic gth(n,idx(:),buf) -- used ONLY by the RMA swap path
+    ! (psi_d_swapdata, rma_pull/push) -- fell back to the host d_base_gthzv, which
+    ! calls x%sync() and copies the WHOLE vector device->host on EVERY swap. Here
+    ! we gather on the device straight from x%deviceVect (mirrors the class-default
+    ! branch of d_cuda_gthzv_x): only the n boundary indices move H2D, no full D2H.
+    use psb_cuda_env_mod
+    use psi_serial_mod
+    implicit none
+    integer(psb_mpk_) :: n
+    integer(psb_ipk_) :: idx(:)
+    real(psb_dpk_)    :: y(:)
+    class(psb_d_vect_cuda) :: x
+    integer :: info, ni
+
+    info = 0
+    if (x%is_host()) call x%sync()          ! ensure device copy is current (no D2H of the whole vector)
+    ni = size(idx)
+
+    if (x%i_buf_sz < ni) then
+      if (c_associated(x%i_buf)) then
+        call freeInt(x%i_buf); x%i_buf = c_null_ptr
+      end if
+      info = allocateInt(x%i_buf,ni); x%i_buf_sz = ni
+    end if
+    if (x%dt_buf_sz < n) then
+      if (c_associated(x%dt_buf)) then
+        call freeDouble(x%dt_buf); x%dt_buf = c_null_ptr
+      end if
+      info = allocateDouble(x%dt_buf,n); x%dt_buf_sz = n
+    end if
+
+    if (info == 0) info = writeInt(x%i_buf,idx,ni)
+    if (info == 0) info = igathMultiVecDeviceDouble(x%deviceVect, 0, n, 1, x%i_buf, 1, x%dt_buf, 1)
+    if (info == 0) info = readDouble(x%dt_buf,y,n)
+
+  end subroutine d_cuda_gthzv
+
  subroutine d_cuda_gthzv_x(i,n,idx,x,y)
    use psb_cuda_env_mod
    use psi_serial_mod
--- a/test/comm/cg/gpu_cg.sh
+++ b/test/comm/cg/gpu_cg.sh
@ -0,0 +1,89 @@
+#!/bin/bash
+# ============================================================================
+# gpu_cg.sh   GPU Conjugate Gradient across comm backends -- "sparse ranks"
+#
+# Final test of the comm-scheme study: a full CG solve (not just spmv) on GPU,
+# with few MPI ranks (1 rank = 1 GPU) spread over MANY nodes so the halo
+# exchange is dominated by inter-node traffic -- the regime where RMA may pay
+# off. The test sweeps ALL backends (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET,
+# MPI_PUT) x 3 preconditioners internally, so there is no per-mode loop here.
+#
+# It also self-verifies, for every scheme, that an internally-allocated work
+# vector (like CG's hidden r/p/q/z) inherits the scheme from desc_a%comm_type
+# -- look for "[OK] internal-style work vector inherited scheme" in run.out,
+# and abort on "INTERNAL-VECTOR SCHEME MISMATCH".
+# ============================================================================
+#SBATCH --job-name=cg_gpu
+#SBATCH --output=cg_gpu_%j.out
+#SBATCH --error=cg_gpu_%j.err
+#SBATCH --nodes=8
+#SBATCH --ntasks-per-node=4         # MN5-ACC: 4x H100 per node
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=20          # 80 cores / 4 ranks
+#SBATCH --time=00:30:00
+#SBATCH --qos=acc_debug
+#SBATCH --account=ehpc580
+
+# ============================================================================
+# USER CONFIGURATION
+# ============================================================================
+NREP=8                              # CG solves per (scheme,prec) for statistics
+NWARM=1                             # warm-up solves (discarded)
+ITMAX=500                           # CG iteration cap
+IDIM=200                            # ignored when --matrix is given
+
+# --- "sparse" knob: GPUs (= ranks) per node.
+#     RANKS_PER_NODE=1 -> 1 GPU/node, maximally spread, pure inter-node comm.
+RANKS_PER_NODE=1
+RANK_POINTS="2 4 8"                 # total ranks per scale point
+
+EXE=$HOME/Desktop/scorep/psblas3/test/comm/cg/runs/psb_comm_cg_test
+MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx
+
+# ============================================================================
+# ENVIRONMENT
+# ============================================================================
+module purge
+module load bsc/1.0
+module load nvidia-hpc-sdk/25.3
+module load gcc/12.3.0
+module load ucx/1.16.0-gcc
+module load openmpi/5.0.5-gcc
+module load openblas/0.3.27-gcc
+export PATH="$HOME/scorep-cuda/bin:$PATH"
+export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
+
+export OMPI_MCA_coll_hcoll_enable=0
+export SCOREP_CUDA_ENABLE=yes
+export SCOREP_CUDA_BUFFER=48M
+
+RESDIR=$SLURM_SUBMIT_DIR/results_cg_gpu_${SLURM_JOB_ID}
+mkdir -p $RESDIR
+
+echo "=== CG GPU (sparse-ranks study) ==="
+echo "  nrep=$NREP nwarm=$NWARM itmax=$ITMAX ranks_per_node=$RANKS_PER_NODE"
+echo "  reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS]"
+echo "==================================="
+
+for NRANKS in $RANK_POINTS; do
+    NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE ))
+    STEP_DIR=$RESDIR/${NRANKS}ranks
+    mkdir -p $STEP_DIR
+
+    echo ""
+    echo ">>> CG GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)"
+
+    srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \
+         --gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \
+    $EXE $IDIM $NREP $NWARM $ITMAX --gpu=TRUE --matrix=$MATRIX --fmt=MM \
+    > $STEP_DIR/run.out 2>&1
+
+    echo ">>> exit=$? output=$STEP_DIR/run.out"
+    # quick propagation sanity at submit time:
+    grep -q "INTERNAL-VECTOR SCHEME MISMATCH" $STEP_DIR/run.out \
+        && echo "    !! SCHEME PROPAGATION FAILED at $NRANKS ranks -- check run.out"
+done
+
+echo ""
+echo "=== CG GPU DONE. Results: $RESDIR ==="
+ls -R $RESDIR
--- a/test/comm/cg/psb_comm_cg_test.F90
+++ b/test/comm/cg/psb_comm_cg_test.F90
@ -18,6 +18,8 @@ program psb_comm_cg_test
  type(psb_dspmat_type) :: a
  type(psb_desc_type) :: desc_a
  type(psb_d_vect_type) :: b, x
+  type(psb_d_vect_type) :: probe          ! mimics a hidden CG work vector for scheme-propagation check
+  integer(psb_ipk_)     :: probe_got
  type(psb_dprec_type)  :: prec
 #ifdef PSB_HAVE_CUDA
  type(psb_d_vect_cuda) :: vmold
@ -30,7 +32,7 @@ program psb_comm_cg_test
  integer(psb_ipk_) :: desc_me, desc_np
  integer(psb_ipk_) :: idim, itmax, itrace, istop, iter
  integer(psb_ipk_) :: scheme_idx, prec_idx, rep, nrep, nwarm
-  integer(psb_ipk_), parameter :: n_schemes=5, n_precs=2
+  integer(psb_ipk_), parameter :: n_schemes=5, n_precs=3
  integer(psb_ipk_), allocatable :: iter_count(:,:,:), solve_info(:,:,:)
  integer(psb_ipk_) :: scheme_type(n_schemes)
  real(psb_dpk_) :: eps, err, t_start, t_elapsed
@ -81,8 +83,10 @@ program psb_comm_cg_test

  prec_type(1) = 'NONE'
  prec_type(2) = 'DIAG'
+  prec_type(3) = 'BJAC'
  prec_name(1) = 'none'
  prec_name(2) = 'diag'
+  prec_name(3) = 'bjac'

  call get_command_argument(1,arg)
  if (len_trim(arg) > 0) then
@ -167,7 +171,7 @@ program psb_comm_cg_test
    end if
    write(psb_out_unit,'("Number of processors : ",i0)') np
    write(psb_out_unit,'("Iterative method     : CG")')
-    write(psb_out_unit,'("Preconditioners      : NONE, DIAG")')
+    write(psb_out_unit,'("Preconditioners      : NONE, DIAG, BJAC")')
    write(psb_out_unit,'("Max iterations (CG)  : ",i0)') itmax
    write(psb_out_unit,'("Repetitions          : ",i0)') nrep
    write(psb_out_unit,'("Warmup solves        : ",i0)') nwarm
@ -280,6 +284,36 @@ program psb_comm_cg_test
          end if
        end if

+        ! ---- DECISIVE CHECK: scheme propagation to a HIDDEN-style work vector ----
+        ! The x check above can be silently skipped: inside CG, x is only updated
+        ! with axpby (never haloed), so x%v%comm_handle may stay unallocated. The
+        ! vectors that ARE haloed (r,p,q,z) are allocated and freed *inside* psb_dcg,
+        ! so the test cannot inspect them directly. Here we replicate exactly what
+        ! psb_dcg does to those work vectors -- psb_geall against desc_a + geasb with
+        ! mold=x%v (so it is GPU-resident too when use_gpu) -- then force one halo.
+        ! Its comm_handle must lazy-init from desc_a%comm_type, proving that any
+        ! internally-allocated vector inherits the chosen scheme.
+        call psb_geall(probe, desc_a, info)
+        if (info == psb_success_) call psb_geasb(probe, desc_a, info, mold=x%v)
+        if (info == psb_success_) call psb_geaxpby(done, b, dzero, probe, desc_a, info)
+        if (info == psb_success_) call psb_halo(probe, desc_a, info)   ! triggers dispatch
+        if (info /= psb_success_) goto 9999
+        probe_got = -1
+        if (allocated(probe%v%comm_handle)) probe_got = probe%v%comm_handle%comm_type
+        if (probe_got /= scheme_type(scheme_idx)) then
+          if (my_rank == psb_root_) &
+            write(psb_err_unit,'("INTERNAL-VECTOR SCHEME MISMATCH rank=",i0," expected=",i0," got=",i0)') &
+                 my_rank, scheme_type(scheme_idx), probe_got
+          info = psb_err_internal_error_
+          goto 9999
+        else if (my_rank == psb_root_ .and. rep == 1) then
+          write(psb_out_unit,'("  [OK] internal-style work vector inherited scheme: ",a)') &
+               trim(scheme_name(scheme_idx))
+        end if
+        call psb_gefree(probe, desc_a, info)
+        if (info /= psb_success_) goto 9999
+        ! ------------------------------------------------------------------------
+
        call psb_geaxpby(dzero,b,dzero,x,desc_a,info)
        if (info /= psb_success_) goto 9999

--- a/test/comm/spmv/Makefile
+++ b/test/comm/spmv/Makefile
@ -15,7 +15,7 @@ TOBJS=psb_spmv_test.o

 EXEDIR=./runs

-all: runsd spmv_overlap
+all: runsd psb_spmv_kernel

 runsd:
 	(if test ! -d runs ; then mkdir runs; fi)
@ -23,10 +23,9 @@ runsd:
 psb_spmv_test.o: psb_spmv_test.f90
 	$(FC) $(FCOPT) -cpp $(FINCLUDES) $(FDEFINES) $(FCUDEFINES) -c $< -o $@

-spmv_overlap: $(TOBJS)
-	$(FLINK) $(LOPT) $(TOBJS) -o spmv_overlap $(PSBLAS_LIB) $(LDLIBS)
-	/bin/mv spmv_overlap $(EXEDIR)
-	/bin/cp -f $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel
+psb_spmv_kernel: $(TOBJS)
+	$(FLINK) $(LOPT) $(TOBJS) -o psb_spmv_kernel $(PSBLAS_LIB) $(LDLIBS)
+	/bin/mv psb_spmv_kernel $(EXEDIR)

 clean:
 	/bin/rm -f $(TOBJS) $(TOBJS_API) *$(.mod) $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel
--- a/test/comm/spmv/extract_scorep.sh
+++ b/test/comm/spmv/extract_scorep.sh
@ -0,0 +1,206 @@
+#!/bin/bash
+# ============================================================================
+# extract_scorep.sh  -- auto-extract comm-scheme measurements from a
+#                       results_*_<jobid> directory (swapdata/spmv style).
+#
+# For every <N>ranks/ subdir it collects:
+#   1) run.out wall-clock avg time per backend  (the REAL headline number)
+#   2) cube_stat -r per swap routine from scorep_profile/profile.cubex:
+#        - INCL (total time in the routine, summed over call paths)
+#        - one-time setup  (Win_create/Win_allocate/topology_init/
+#                            ini_memory_buffer_layout/alltoallv_init)
+#        - steady = INCL - one-time
+#        - every MPI_* component (to scorep_components.csv)
+#
+# Output (written into RESULTS_DIR):
+#   runout.csv            ranks,backend,total_s,avg_s
+#   scorep_summary.csv    ranks,scheme,incl_s,onetime_s,steady_s
+#   scorep_components.csv ranks,scheme,component,time_s
+# plus pretty pivot tables on stdout.
+#
+# Usage:
+#   ./extract_scorep.sh                 # process ALL <N>ranks/ in current dir
+#   ./extract_scorep.sh 80ranks         # process ONE rank-point dir
+#   ./extract_scorep.sh path/to/results # process ALL <N>ranks/ under that dir
+#
+# CSVs ACCUMULATE (header written only if absent), so you can run one
+# rank-point at a time and the pivots grow. To start over: rm *.csv
+# ============================================================================
+set -u
+ARG="${1:-.}"
+OUTDIR="${OUTDIR:-.}"     # where the CSVs go (default: current dir)
+
+command -v cube_stat >/dev/null 2>&1 || { echo "ERROR: cube_stat not in PATH"; exit 1; }
+
+# scheme label : mangled routine name used by cube_stat -r
+SCHEMES=(
+  "baseline:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_baseline_vect"
+  "neighbor:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_topology_vect"
+  "persistent:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_neighbor_persistent_topology_vect"
+  "rma_pull:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_pull_vect"
+  "rma_push:psi_d_comm_v_mod.psi_d_swapdata_impl::psi_dswap_rma_push_vect"
+)
+
+RUNOUT="$OUTDIR/runout.csv"
+SUMMARY="$OUTDIR/scorep_summary.csv"
+COMPONENTS="$OUTDIR/scorep_components.csv"
+# write header only if the file does not exist yet (append/accumulate mode)
+[ -f "$RUNOUT" ]     || echo "ranks,backend,total_s,avg_s,setup_s"     > "$RUNOUT"
+[ -f "$SUMMARY" ]    || echo "ranks,scheme,incl_s,setup_s,gpuwait_s,pack_s,mpimove_s,mpiwait_s,netmpi_s" > "$SUMMARY"
+[ -f "$COMPONENTS" ] || echo "ranks,scheme,bucket,component,time_s"     > "$COMPONENTS"
+
+# Select dirs: a single rank-point dir (has scorep_profile/run.out) or, if ARG
+# is a results dir, all its <N>ranks/ subdirs.
+if [ -f "$ARG/scorep_profile/profile.cubex" ] || [ -f "$ARG/run.out" ]; then
+  DIRS=( "$ARG" )
+else
+  DIRS=( $(ls -d "$ARG"/*ranks 2>/dev/null | sort -t/ -k99 -V) )
+fi
+[ ${#DIRS[@]} -eq 0 ] && { echo "No rank-point dir(s) found under '$ARG'"; exit 1; }
+
+for d in "${DIRS[@]}"; do
+  ranks=$(basename "$d" | sed 's/[^0-9]//g')
+  [ -z "$ranks" ] && continue
+
+  # ---- 1) run.out wall-clock avg + setup per backend ----
+  # One run.out holds several "comm backend" blocks in sequence; we accumulate
+  # tot/avg/setup for the current block and flush it when the next block starts
+  # (or at EOF). "setup time" is optional: 0 if the benchmark does not emit it
+  # (i.e. until the warm-up-timing patch lands), so the break-even falls back to
+  # pure steady-state comparison.
+  if [ -f "$d/run.out" ]; then
+    awk -v r="$ranks" '
+      function flush(){ if (be!=""){ print r","be","tot","avg","setup } be="";tot=0;avg=0;setup=0 }
+      /comm backend/  { flush(); split($0,a,":"); be=a[2]; gsub(/ /,"",be) }
+      /total time/    { split($0,a,":"); tot=a[2]+0 }
+      /avg time/      { split($0,a,":"); avg=a[2]+0 }
+      /setup time/    { split($0,a,":"); setup=a[2]+0 }
+      END { flush() }
+    ' "$d/run.out" >> "$RUNOUT"
+  fi
+
+  # ---- 2) scorep cube_stat per scheme ----
+  cubex="$d/scorep_profile/profile.cubex"
+  [ -f "$cubex" ] || { echo "WARN: no profile.cubex in $d" >&2; continue; }
+
+  for entry in "${SCHEMES[@]}"; do
+    label="${entry%%:*}"; routine="${entry#*:}"
+    out=$(cube_stat -r "$routine" "$cubex" 2>/dev/null)
+    [ -z "$out" ] && continue
+    printf '%s\n' "$out" | awk -v ranks="$ranks" -v scheme="$label" -v comp="$COMPONENTS" '
+      # Bucketize every child component of the swap routine. cube_stat -r reports,
+      # for each child, its inclusive time within the routine subtree, and the
+      # children partition INCL -- so per-bucket sums are meaningful.
+      #
+      # Why this matters on GPU: the single largest component in EVERY scheme is
+      # d_cuda_device_wait (~0.036s, the GPU stream sync), which is NOT network
+      # time and is ~equal across schemes. The old "steady = INCL - onetime" left
+      # it inside, masking the real differentiator -- the MPI wait/sync time.
+      # Buckets (each component lands in exactly one, order matters):
+      #   setup   : one-time  Win_create / topology_init / alltoallv_init /
+      #                       ini_memory_buffer_layout
+      #   gpuwait : GPU stream sync (device_wait, cuda_sync) -- exclude from MPI
+      #   mpiwait : blocking MPI sync (MPI_Wait, Win_wait/post/start/complete)
+      #             <-- the discriminating metric ("guarda le wait")
+      #   mpimove : actual transfer (Isend/Irecv/Put/Get/Start/[I]neighbor_alltoallv)
+      #   pack    : gather/scatter/buffer (gthz*, sctb*, new_buffer, realloc)
+      #   netmpi  = mpimove + mpiwait  (honest per-iteration network cost)
+      {
+        k=split($0,a,","); v=a[k]+0          # value = last comma-field
+        nm=$0; sub(/,[^,]*$/,"",nm)           # name  = everything before it
+        if (nm ~ /^INCL\(/) { incl+=v; next }
+        if (nm ~ /^EXCL\(/) { next }
+        bkt="other"
+        if      (nm ~ /Win_create|Win_allocate|topology_init|ini_memory_buffer_layout|alltoallv_init/) { setup+=v;   bkt="setup" }
+        else if (nm ~ /device_wait|cuda_sync/)                                                          { gpuwait+=v; bkt="gpuwait" }
+        else if (nm ~ /MPI_Wait|Win_wait|Win_post|Win_start|Win_complete/)                              { mpiwait+=v; bkt="mpiwait" }
+        else if (nm ~ /MPI_Isend|MPI_Irecv|MPI_Send|MPI_Recv|MPI_Put|MPI_Get|MPI_Start|neighbor_alltoallv|Neighbor_alltoallv/) { mpimove+=v; bkt="mpimove" }
+        else if (nm ~ /gthz|sctb|new_buffer|realloc/)                                                   { pack+=v;    bkt="pack" }
+        cn=nm; sub(/.*::/,"",cn); gsub(/"/,"",cn)
+        print ranks","scheme","bkt","cn","v >> comp
+      }
+      END { printf "%s,%s,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f,%.4f\n", \
+                   ranks, scheme, incl, setup, gpuwait, pack, mpimove, mpiwait, mpimove+mpiwait }
+    ' >> "$SUMMARY"
+  done
+done
+
+# ---- pretty pivots ----
+pivot() {  # $1=csv  $2=value-col-index(1-based, after ranks,key)  $3=title
+  awk -F, -v vc="$2" -v title="$3" '
+    NR==1 { next }
+    { r=$1; k=$2; val=$(2+vc); v[r","k]=val; rs[r]=1; ks[k]=1 }
+    END {
+      n=0; for (k in ks) cols[++n]=k
+      # stable-ish column order
+      order="baseline neighbor persistent rma_pull rma_push P2P NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT"
+      m=split(order,oc," "); delete cols; n=0
+      for (i=1;i<=m;i++) if (oc[i] in ks) cols[++n]=oc[i]
+      printf "\n=== %s ===\n", title
+      printf "%-8s", "ranks"; for (i=1;i<=n;i++) printf "%14s", cols[i]; printf "\n"
+      nr=0; for (r in rs) rr[++nr]=r
+      # numeric sort of ranks
+      for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t}
+      for (i=1;i<=nr;i++){ printf "%-8s", rr[i]; for (c=1;c<=n;c++){key=rr[i]","cols[c]; printf "%14s", (key in v)?v[key]:"-"} printf "\n" }
+    }' "$1"
+}
+
+# ---- break-even pivot: iterations after which scheme X beats P2P ----
+# Model for an iterative solver doing n_it spmv (halo exchanges):
+#     T_X(n) = setup_X + n * steady_X     (steady = run.out avg, setup excluded
+#                                           by the warm-up; setup = run.out "setup time")
+# X beats the P2P reference when T_X(n) < T_P2P(n). Solving for the crossover:
+#     n* = (setup_X - setup_P2P) / (steady_P2P - steady_X)
+# Output legend (per cell):
+#   <number> : break-even n*  (X wins for n_it > n*)   -- more setup, less steady
+#   always   : X wins from iteration 1 (cheaper setup AND cheaper steady) -> dominates P2P
+#   never    : P2P wins for all n_it (more setup AND slower steady) -> X is dominated
+#   <n*      : X wins ONLY for n_it < n*  (cheaper setup but slower steady)
+#              ^ this is the GPU/few-ranks RMA case: if RMA steady drops below P2P
+#                you'll instead see a number or "always" here.
+breakeven() {
+  awk -F, '
+    NR==1 { next }
+    { r=$1; be=$2; steady[r","be]=$4+0; setp[r","be]=$5+0; rs[r]=1; bes[be]=1 }
+    END {
+      order="NEIGHBOR PNEIGHBOR MPI_GET MPI_PUT"; m=split(order,oc," ")
+      printf "\n=== break-even n* vs P2P  (iterations to amortize setup; needs run.out '\''setup time'\'') ===\n"
+      printf "%-8s", "ranks"; for (i=1;i<=m;i++) if (oc[i] in bes) printf "%14s", oc[i]; printf "\n"
+      nr=0; for (r in rs) rr[++nr]=r
+      for (i=1;i<=nr;i++) for (j=i+1;j<=nr;j++) if (rr[i]+0>rr[j]+0){t=rr[i];rr[i]=rr[j];rr[j]=t}
+      anysetup=0
+      for (i=1;i<=nr;i++){
+        r=rr[i]; printf "%-8s", r
+        sref=steady[r",P2P"]; pref=setp[r",P2P"]
+        for (c=1;c<=m;c++){ be=oc[c]; if (!(be in bes)) continue
+          key=r","be
+          if (!(key in steady) || !(r",P2P" in steady)) { printf "%14s","-"; continue }
+          dsteady = sref - steady[key]      # >0 : X faster per-iter than P2P
+          dsetup  = setp[key] - pref        # >0 : X pays more setup than P2P
+          if (setp[key]>0 || pref>0) anysetup=1
+          if (dsteady > 0)      printf "%14s", (dsetup<=0 ? "always" : sprintf("%.0f", dsetup/dsteady))
+          else if (dsteady < 0) printf "%14s", (dsetup>=0 ? "never"  : sprintf("<%.0f", dsetup/dsteady))
+          else                  printf "%14s", (dsetup<0  ? "always" : "never")
+        }
+        printf "\n"
+      }
+      if (!anysetup) print "\n  NOTE: all setup_s == 0 -> run.out has no \"setup time\" yet (apply the\n        warm-up-timing patch to the benchmark). Columns reflect steady-state only."
+    }' "$RUNOUT"
+}
+
+echo "Wrote: $RUNOUT  $SUMMARY  $COMPONENTS"
+pivot "$RUNOUT"  2 "run.out  AVG time per backend [s]  (real wall-clock = steady-state per iter)"
+pivot "$RUNOUT"  3 "run.out  SETUP time per backend [s]  (one-time, paid in warm-up)"
+breakeven
+pivot "$SUMMARY" 6 "scorep  MPI WAIT per scheme [s]  <== THE discriminator (sync/imbalance; 'guarda le wait')"
+pivot "$SUMMARY" 7 "scorep  NET MPI per scheme [s]  (mpimove + mpiwait; excl GPU sync, pack, one-time setup)"
+pivot "$SUMMARY" 5 "scorep  MPI MOVE per scheme [s]  (pure transfer: Isend/Irecv/Put/Get/a2av)"
+pivot "$SUMMARY" 2 "scorep  SETUP per scheme [s]  (one-time: Win_create/topology_init/alltoallv_init/buf_layout)"
+pivot "$SUMMARY" 3 "scorep  GPU WAIT per scheme [s]  (d_cuda_device_wait; sanity: ~equal across schemes)"
+pivot "$SUMMARY" 1 "scorep  INCL per scheme [s]  (everything; dominated by GPU wait -> misleading alone)"
+
+echo
+echo "Tip: full per-component breakdown (now bucketed) is in $COMPONENTS"
+echo "     (ranks,scheme,bucket,component,time).  e.g.:"
+echo "       grep ',rma_pull,' $COMPONENTS | sort -t, -k5 -gr   # rma_pull hot spots"
+echo "       awk -F, '\$3==\"mpiwait\"' $COMPONENTS              # only the waits"
--- a/test/comm/spmv/gpu_spmv.sh
+++ b/test/comm/spmv/gpu_spmv.sh
@ -0,0 +1,113 @@
+#!/bin/bash
+# ============================================================================
+# gpu_spmv.sh   GPU SpMV across comm backends -- "sparse ranks" study
+#
+# Goal: few MPI ranks spread over MANY nodes (1 rank = 1 GPU), so that the
+# halo exchange is dominated by INTER-node traffic. This is where RMA
+# (MPI_Get/MPI_Put) may finally pay off: with few, fat messages over the
+# network the one-time window-creation cost can amortize (break-even n*),
+# unlike the dense CPU runs where RMA was dominated.
+#
+# The spmv test sweeps ALL backends internally (P2P, NEIGHBOR, PNEIGHBOR,
+# MPI_GET, MPI_PUT) in one run -> no per-mode loop here.
+# ============================================================================
+#SBATCH --job-name=spmv_gpu
+#SBATCH --output=spmv_gpu_%j.out
+#SBATCH --error=spmv_gpu_%j.err
+#SBATCH --nodes=8                   # reserve up front; we use a subset per point
+#SBATCH --ntasks-per-node=4         # MN5-ACC has 4x H100 per node
+#SBATCH --gres=gpu:4                # request all 4 GPUs on each reserved node
+#SBATCH --cpus-per-task=20          # 80 cores / 4 ranks = 20 cores per rank
+#SBATCH --time=00:30:00
+#SBATCH --qos=acc_debug
+#SBATCH --account=ehpc580
+
+# ============================================================================
+# USER CONFIGURATION
+# ============================================================================
+TIMES=50                            # SpMV repetitions (timed)
+GPU_FMT=HLG                         # GPU matrix format (HLG|ELG|CSRG|HDIAG)
+PROFILE=true                        # Score-P profiling on/off
+
+# --- "sparse" knob: how many ranks (= GPUs) to put on each node.
+#     RANKS_PER_NODE=1  -> 1 GPU/node, maximally spread, pure inter-node comm
+#                          (this is the configuration that isolates RMA-over-network)
+#     RANKS_PER_NODE=4  -> pack all 4 GPUs/node (denser, more intra-node)
+RANKS_PER_NODE=1
+
+# total ranks per scale point. With RANKS_PER_NODE=1 these are also node counts,
+# so "2 4 8" means 2, 4, 8 nodes each holding a single GPU.
+RANK_POINTS="2 4 8"
+
+EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel
+MATRIX=$HOME/Desktop/scorep/psblas3/test/comm/data/Geo_1438.mtx
+
+# ============================================================================
+# ENVIRONMENT
+# ============================================================================
+module purge
+module load bsc/1.0
+module load nvidia-hpc-sdk/25.3
+module load gcc/12.3.0
+module load ucx/1.16.0-gcc
+module load openmpi/5.0.5-gcc
+module load openblas/0.3.27-gcc
+export PATH="$HOME/scorep-cuda/bin:$PATH"
+export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
+
+export OMPI_MCA_coll_hcoll_enable=0
+export SCOREP_CUDA_ENABLE=yes           # GPU run: turn the CUDA adapter ON
+export SCOREP_CUDA_BUFFER=48M           # per-stream CUDA record buffer
+
+RESDIR=$SLURM_SUBMIT_DIR/results_spmv_gpu_${SLURM_JOB_ID}
+mkdir -p $RESDIR
+
+# --- Score-P region filter (same as CPU: drop tiny high-frequency USR funcs) ---
+FILTER=$RESDIR/scorep.filt
+cat > $FILTER <<'EOF'
+SCOREP_REGION_NAMES_BEGIN
+  EXCLUDE
+    psb_indx_map_mod::*
+    psb_desc_mod::*
+    psb_error_mod::*
+    psb_gen_block_map_mod::*
+    psi_penv_mod::*
+    psb_hash_mod::*
+SCOREP_REGION_NAMES_END
+EOF
+
+echo "=== SPMV GPU (sparse-ranks study) ==="
+echo "  times=$TIMES gpu_fmt=$GPU_FMT ranks_per_node=$RANKS_PER_NODE"
+echo "  reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE"
+echo "====================================="
+
+for NRANKS in $RANK_POINTS; do
+    NNODES=$(( (NRANKS + RANKS_PER_NODE - 1) / RANKS_PER_NODE ))
+    STEP_DIR=$RESDIR/${NRANKS}ranks
+    mkdir -p $STEP_DIR
+
+    if [ "$PROFILE" = "true" ]; then
+        export SCOREP_ENABLE_PROFILING=true
+        export SCOREP_ENABLE_TRACING=false
+        export SCOREP_TOTAL_MEMORY=256M     # bumped: CUDA adapter needs more
+        export SCOREP_FILTERING_FILE=$FILTER
+        export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile
+    else
+        export SCOREP_ENABLE_PROFILING=false
+        export SCOREP_ENABLE_TRACING=false
+    fi
+
+    echo ""
+    echo ">>> GPU point: $NRANKS ranks on $NNODES nodes ($RANKS_PER_NODE GPU/node)"
+
+    srun -N $NNODES -n $NRANKS --ntasks-per-node=$RANKS_PER_NODE \
+         --gres=gpu:$RANKS_PER_NODE --gpu-bind=single:1 --cpus-per-task=20 \
+    $EXE --gpu=TRUE --gpu_fmt=$GPU_FMT --matrix=$MATRIX --fmt=MM --times=$TIMES \
+    > $STEP_DIR/run.out 2>&1
+
+    echo ">>> exit=$? output=$STEP_DIR/run.out"
+done
+
+echo ""
+echo "=== SPMV GPU DONE. Results: $RESDIR ==="
+ls -R $RESDIR
--- a/test/comm/spmv/psb_spmv_test.f90
+++ b/test/comm/spmv/psb_spmv_test.f90
@ -525,7 +525,7 @@ contains
    return
  end subroutine psb_d_gen_pde3d

-  subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,do_swap,comm_mode)
+  subroutine run_spmv_kernel(ctxt,use_gpu,matrix_file,matrix_fmt,cpu_fmt,gpu_fmt,idim_in,times_in,comm_mode)
    use psb_base_mod
 #ifdef PSB_HAVE_CUDA
    use psb_cuda_mod
@ -539,7 +539,6 @@ contains
    character(len=*), intent(in)    :: cpu_fmt
    character(len=*), intent(in)    :: gpu_fmt
    integer(psb_ipk_), intent(in)   :: idim_in, times_in
-    logical, intent(in)             :: do_swap
    character(len=*), intent(in)    :: comm_mode

    type(psb_dspmat_type)           :: a
@ -628,9 +627,6 @@ contains
    end if
    if (info /= psb_success_) goto 9999

-    call psb_comm_set(comm_type,x%v%comm_handle,info)
-    if (info /= psb_success_) goto 9999
-
 #ifdef PSB_HAVE_CUDA
    if (use_gpu) then
      select case(psb_toupper(trim(gpu_fmt)))
@ -654,14 +650,22 @@ contains
    end if
 #endif

+    ! Set the comm scheme on the DESCRIPTOR, not the vector. desc_a%comm_type
+    ! survives the GPU x%cnv above (which allocates a fresh x%v WITHOUT copying
+    ! comm_handle); x then lazy-inits its handle from desc_a%comm_type at the
+    ! first halo. Setting psb_comm_set on x%v%comm_handle *before* cnv silently
+    ! reverted every backend to baseline on GPU (CPU was unaffected: no cnv).
+    call desc_a%set_comm_scheme(comm_type, info)
+    if (info /= psb_success_) goto 9999
+
    ! warm-up
-    call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap)
+    call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.true.)
    if (info /= psb_success_) goto 9999

    call psb_barrier(ctxt)
    t0 = psb_wtime()
    do i = 1, times
-      call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=do_swap)
+      call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.true.)
      if (info /= psb_success_) exit
    end do
    t1 = psb_wtime()
@ -672,11 +676,7 @@ contains
    avg_t = dt / real(times, psb_dpk_)

    if (my_rank == psb_root_) then
-      if (do_swap) then
-        write(psb_out_unit,'(/,"SpMV benchmark (overlap)")')
-      else
-        write(psb_out_unit,'(/,"SpMV benchmark (no overlap)")')
-      end if
+      write(psb_out_unit,'(/,"SpMV benchmark")')
      write(psb_out_unit,'("  cpu matrix fmt   : ",a)') trim(afmt)
      if (use_gpu) write(psb_out_unit,'("  gpu matrix fmt   : ",a)') trim(psb_toupper(trim(gpu_fmt)))
      if (use_external_matrix) then
@ -784,7 +784,6 @@ program psb_spmv_kernel
  character(len=8)   :: cpu_fmt
  character(len=8)   :: gpu_fmt
  integer(psb_ipk_) :: idim_arg, times_arg
-  logical           :: do_overlap
  integer :: kmode
  integer, parameter :: n_comm_modes = 5
  character(len=20), parameter :: comm_modes(n_comm_modes) = [character(len=20) :: &
@ -797,7 +796,6 @@ program psb_spmv_kernel
  matrix_fmt  = 'MM'
  cpu_fmt     = 'CSR'
  gpu_fmt     = 'HLG'
-  do_overlap  = .true.

  call psb_init(ctxt)
  call psb_info(ctxt, my_rank, np)
@ -842,10 +840,6 @@ program psb_spmv_kernel
      gpu_fmt = psb_toupper(adjustl(arg(14:len_trim(arg))))
    else if (index(psb_toupper(trim(arg)), '--GPU_FMT=') == 1) then
      gpu_fmt = psb_toupper(adjustl(arg(11:len_trim(arg))))
-    else if ((trim(psb_toupper(arg)) == '--NOOVERLAP') .or. (trim(psb_toupper(arg)) == '--NO_OVERLAP')) then
-      do_overlap = .false.
-    else if ((trim(psb_toupper(arg)) == '--OVERLAP') .or. (trim(psb_toupper(arg)) == '--SWAP')) then
-      do_overlap = .true.
    else if (trim(psb_toupper(arg)) == '--MATRIX') then
      if (k < command_argument_count()) call get_command_argument(k+1,matrix_file)
    else if (trim(psb_toupper(arg)) == '--FMT') then
@ -897,7 +891,7 @@ program psb_spmv_kernel
    write(psb_out_unit,'("GPU enabled          : ",l1)') use_gpu
    write(psb_out_unit,'("Usage: ./psb_spmv_kernel [--gpu=TRUE|FALSE] [--dim=N] [--times=N] ",&
      &"[--cpu_fmt=CSR|COO|CSC|ELL|HLL] [--gpu_fmt=HLL|ELL|CSR|HDIA] [--matrix=<path>] [--fmt=MM|HB] ",&
-      &"[--overlap|--nooverlap] (runs all comm backends)")')
+      &"(runs all comm backends)")')
  end if

  do kmode = 1, n_comm_modes
@ -905,7 +899,7 @@ program psb_spmv_kernel
      write(psb_out_unit,'(/,"=== Backend sweep: ",a," ===")') trim(comm_modes(kmode))
    end if
    call run_spmv_kernel(ctxt, use_gpu, matrix_file, matrix_fmt, cpu_fmt, gpu_fmt, &
-      & idim_arg, times_arg, do_overlap, comm_modes(kmode))
+      & idim_arg, times_arg, comm_modes(kmode))
  end do

 #ifdef PSB_HAVE_CUDA
--- a/test/comm/spmv/strong_spmv.sh
+++ b/test/comm/spmv/strong_spmv.sh
@ -0,0 +1,96 @@
+#!/bin/bash
+# ============================================================================
+# strong_spmv.sbatch  STRONG scaling, CPU-only SpMV across comm backends
+#
+# Fixed total problem size; MPI ranks grow across scale points. The spmv test
+# sweeps ALL comm backends internally (P2P, NEIGHBOR, PNEIGHBOR, MPI_GET,
+# MPI_PUT) in a single run, so there is no per-mode loop.
+# All scale points run inside ONE reserved allocation (same hardware).
+# ============================================================================
+#SBATCH --job-name=spmv_strong
+#SBATCH --output=spmv_strong_%j.out
+#SBATCH --error=spmv_strong_%j.err
+#SBATCH --nodes=8                   # reserve the maximum up front
+#SBATCH --ntasks-per-node=80        # 80 cores per ACC node -> 80 ranks/node
+#SBATCH --cpus-per-task=1           # CPU-only: one core per rank
+#SBATCH --time=00:30:00
+#SBATCH --qos=acc_debug
+#SBATCH --account=ehpc580
+
+# ============================================================================
+# USER CONFIGURATION
+# ============================================================================
+DIM=280                             # FIXED problem size (idim^3 unknowns)
+TIMES=50                            # SpMV repetitions (timed)
+CPU_FMT=CSR                         # CPU matrix storage format
+PROFILE=true                        # Score-P profiling on/off
+RANK_POINTS="80 160 320 640"        # total ranks per scale point (multiples of 80)
+
+EXE=$HOME/Desktop/scorep/psblas3/test/comm/spmv/runs/psb_spmv_kernel
+
+# ============================================================================
+# ENVIRONMENT
+# ============================================================================
+module purge
+module load bsc/1.0
+module load nvidia-hpc-sdk/25.3
+module load gcc/12.3.0
+module load ucx/1.16.0-gcc
+module load openmpi/5.0.5-gcc
+module load openblas/0.3.27-gcc
+export PATH="$HOME/scorep-cuda/bin:$PATH"
+export LD_LIBRARY_PATH="$HOME/scorep-cuda/lib:$LD_LIBRARY_PATH"
+
+export OMPI_MCA_coll_hcoll_enable=0
+export SCOREP_CUDA_ENABLE=no        # CPU-only run: silence Score-P CUDA adapter
+
+RESDIR=$SLURM_SUBMIT_DIR/results_spmv_strong_${SLURM_JOB_ID}
+mkdir -p $RESDIR
+
+# --- Score-P region filter: exclude the high-frequency tiny USR functions so
+#     the profile reflects real MPI/comm time, not instrumentation overhead. ---
+FILTER=$RESDIR/scorep.filt
+cat > $FILTER <<'EOF'
+SCOREP_REGION_NAMES_BEGIN
+  EXCLUDE
+    psb_indx_map_mod::*
+    psb_desc_mod::*
+    psb_error_mod::*
+    psb_gen_block_map_mod::*
+    psi_penv_mod::*
+    psb_hash_mod::*
+SCOREP_REGION_NAMES_END
+EOF
+
+echo "=== SPMV STRONG scaling (CPU-only) ==="
+echo "  fixed_dim=$DIM times=$TIMES cpu_fmt=$CPU_FMT"
+echo "  reserved_nodes=$SLURM_NNODES rank_points=[$RANK_POINTS] profile=$PROFILE"
+echo "======================================"
+
+for NRANKS in $RANK_POINTS; do
+    NNODES=$(( (NRANKS + 79) / 80 ))     # 80 ranks per node
+    STEP_DIR=$RESDIR/${NRANKS}ranks
+    mkdir -p $STEP_DIR
+
+    if [ "$PROFILE" = "true" ]; then
+        export SCOREP_ENABLE_PROFILING=true
+        export SCOREP_ENABLE_TRACING=false
+        export SCOREP_TOTAL_MEMORY=128M
+        export SCOREP_FILTERING_FILE=$FILTER
+        export SCOREP_EXPERIMENT_DIRECTORY=$STEP_DIR/scorep_profile
+    else
+        export SCOREP_ENABLE_PROFILING=false
+        export SCOREP_ENABLE_TRACING=false
+    fi
+
+    echo ""
+    echo ">>> STRONG point: $NRANKS ranks ($NNODES nodes), fixed dim=$DIM"
+    srun -N $NNODES -n $NRANKS --ntasks-per-node=80 --cpus-per-task=1 \
+        $EXE --gpu=FALSE --dim=$DIM --times=$TIMES --cpu_fmt=$CPU_FMT \
+        > $STEP_DIR/run.out 2>&1
+    echo ">>> exit=$? output=$STEP_DIR/run.out"
+done
+
+echo ""
+echo "=== SPMV STRONG DONE. Results: $RESDIR ==="
+ls -R $RESDIR
--- a/test/nested/CMakeLists.txt
+++ b/test/nested/CMakeLists.txt
@ -26,6 +26,7 @@ file(MAKE_DIRECTORY ${EXEDIR})
 set(SOURCES_D_NEST_GLOB_TEST  psb_d_nest_glob_test.F90)
 set(SOURCES_D_NEST_RECT_TEST  psb_d_nest_rect_test.F90)
 set(SOURCES_D_NEST_CG_TEST    psb_d_nest_cg_test.F90)
+set(SOURCES_D_NEST_HALO_TEST  psb_d_nest_halo_regime_test.F90)

 add_executable(psb_d_nest_glob_test ${SOURCES_D_NEST_GLOB_TEST})
 target_link_libraries(psb_d_nest_glob_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base)
@ -36,8 +37,11 @@ target_link_libraries(psb_d_nest_rect_test psblas::util psblas::linsolve psblas:
 add_executable(psb_d_nest_cg_test ${SOURCES_D_NEST_CG_TEST})
 target_link_libraries(psb_d_nest_cg_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base)

+add_executable(psb_d_nest_halo_regime_test ${SOURCES_D_NEST_HALO_TEST})
+target_link_libraries(psb_d_nest_halo_regime_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base)
+
 # Set output directory for executables
-foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test)
+foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_halo_regime_test)
    set_target_properties(${target} PROPERTIES
        RUNTIME_OUTPUT_DIRECTORY ${EXEDIR}
    )
--- a/test/nested/Makefile
+++ b/test/nested/Makefile
@ -16,11 +16,15 @@ FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG).

 EXEDIR=./runs

-all: runsd psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test
+all: runsd psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_halo_regime_test

 runsd:
 	(if test ! -d runs ; then mkdir runs; fi)

+psb_d_nest_halo_regime_test: psb_d_nest_halo_regime_test.o
+	$(FLINK) psb_d_nest_halo_regime_test.o -o psb_d_nest_halo_regime_test $(PSBLAS_LIB) $(LDLIBS)
+	/bin/mv psb_d_nest_halo_regime_test $(EXEDIR)
+
 psb_d_nest_glob_test: psb_d_nest_glob_test.o
 	$(FLINK) psb_d_nest_glob_test.o -o psb_d_nest_glob_test $(PSBLAS_LIB) $(LDLIBS)
 	/bin/mv psb_d_nest_glob_test $(EXEDIR)
@ -34,8 +38,9 @@ psb_d_nest_cg_test: psb_d_nest_cg_test.o
 	/bin/mv psb_d_nest_cg_test $(EXEDIR)

 clean:
-	/bin/rm -f psb_d_nest_glob_test.o psb_d_nest_rect_test.o psb_d_nest_cg_test.o *$(.mod) \
-	$(EXEDIR)/psb_d_nest_glob_test $(EXEDIR)/psb_d_nest_rect_test $(EXEDIR)/psb_d_nest_cg_test
+	/bin/rm -f psb_d_nest_glob_test.o psb_d_nest_rect_test.o psb_d_nest_cg_test.o psb_d_nest_halo_regime_test.o *$(.mod) \
+	$(EXEDIR)/psb_d_nest_glob_test $(EXEDIR)/psb_d_nest_rect_test $(EXEDIR)/psb_d_nest_cg_test \
+	$(EXEDIR)/psb_d_nest_halo_regime_test
 verycleanlib:
 	(cd ../..; make veryclean)
 lib:
--- a/test/nested/psb_d_nest_halo_regime_test.F90
+++ b/test/nested/psb_d_nest_halo_regime_test.F90
@ -0,0 +1,398 @@
+!
+!                Parallel Sparse BLAS  version 3.5
+!      (C) Copyright 2006-2018
+!        Salvatore Filippone
+!        Alfredo Buttari
+!
+! File: psb_d_nest_halo_regime_test.F90
+! Author: Simone Staccone (Stack-1)
+!
+! Benchmarks the two halo-exchange regimes of a nested (MATNEST-style) operator
+! on the SAME assembled blocks:
+!
+!   * UNION (fused):  one psb_halo over the composed global descriptor brings in
+!     the union of every field's ghosts at once (the production path used by
+!     psb_spmm / Krylov / AMG4PSBLAS).
+!
+!   * PER-BLOCK (selective): each present block (i,j) exchanges ONLY its own
+!     off-process columns, through its restricted descriptor block_col_desc(i,j),
+!     one psb_halo per block.
+!
+! The two products must coincide to machine precision.  The test reports, for
+! each regime: the number of halo exchanges, the full-matvec time, and the
+! PURE-COMMUNICATION time (only the psb_halo calls), so the cost of the message
+! aggregation can be isolated.
+!
+! Saddle-point 2x2 operator  [ A  B^T ; B  0 ]  with TWO fields of DIFFERENT
+! size (n1 = field 1 / "velocity", n2 = field 2 / "pressure", n1 /= n2 allowed);
+! the (2,2) block is absent.  The blocks have REAL, distinct per-block halos (so
+! per-block vs union is meaningful): A (1,1) tridiagonal on field 1; the
+! rectangular B^T (1,2) and B (2,1) couple every row to a column half a field
+! away, which lands on another process.
+!
+! Run:  mpirun -np <P> ./psb_d_nest_halo_regime_test [n1] [n2] [n_reps] [stride]
+!         n1     : global rows of field 1   (default 2000000)
+!         n2     : global rows of field 2   (default  500000)
+!         n_reps : timed repetitions         (default 50)
+!         stride : B/B^T coupling distance   (default 1)
+!                  small  => local halo  => LATENCY-bound (aggregation wins)
+!                  ~n/2   => huge halo   => BANDWIDTH-bound (union ~ per-block)
+!
+program psb_d_nest_halo_regime_test
+  use psb_base_mod
+  use psb_d_nest_mod
+  implicit none
+
+  type(psb_ctxt_type)             :: context
+  integer(psb_ipk_)               :: my_rank, num_procs, info, i_local_row
+  integer(psb_ipk_)               :: entry_idx, field1_local_rows, field2_local_rows
+  integer(psb_ipk_)               :: n_local_rows, n_local_cols, n_exch, n_warm
+  integer(psb_ipk_)               :: n_reps, narg
+  integer(psb_lpk_)               :: global_row, field1_size, field2_size, gcol, stride
+  character(len=64)               :: arg
+
+  type(psb_d_nest_matrix)         :: nested_matrix
+  integer(psb_lpk_), allocatable  :: entry_rows(:), entry_cols(:)
+  integer(psb_lpk_), allocatable  :: field1_rows(:), field2_rows(:)
+  real(psb_dpk_),    allocatable  :: entry_vals(:)
+  real(psb_dpk_),    allocatable  :: x_arr(:), work_x(:), y_union(:), y_perblock(:)
+  real(psb_dpk_)                  :: mismatch_norm, t_u, t_b
+  logical                         :: ok_u, ok_b
+  integer(psb_ipk_)               :: s_idx, n_schemes
+  integer(psb_ipk_), allocatable  :: schemes(:)
+  character(len=34), allocatable  :: scheme_names(:)
+  type(psb_d_vect_type)              :: xg_vect            ! global-local halo vector
+  type(psb_d_vect_type), allocatable :: bvect(:,:)         ! one per present block
+  real(psb_dpk_), parameter       :: tolerance = 1.0e-10_psb_dpk_
+
+  call psb_init(context)
+  call psb_info(context, my_rank, num_procs)
+
+  ! -------- runtime parameters (rank 0 parses, then broadcast) --------
+  field1_size = 2000000_psb_lpk_
+  field2_size =  500000_psb_lpk_
+  n_reps      = 50
+  stride      = 1_psb_lpk_       ! B/B^T coupling distance: small => local halo
+                                 ! (latency-bound), large (~n/2) => big halo
+                                 ! (bandwidth-bound)
+  if (my_rank == 0) then
+    narg = command_argument_count()
+    if (narg >= 1) then
+      call get_command_argument(1, arg); read(arg,*) field1_size
+    end if
+    if (narg >= 2) then
+      call get_command_argument(2, arg); read(arg,*) field2_size
+    end if
+    if (narg >= 3) then
+      call get_command_argument(3, arg); read(arg,*) n_reps
+    end if
+    if (narg >= 4) then
+      call get_command_argument(4, arg); read(arg,*) stride
+    end if
+  end if
+  call psb_bcast(context, field1_size)
+  call psb_bcast(context, field2_size)
+  call psb_bcast(context, n_reps)
+  call psb_bcast(context, stride)
+  n_warm = max(5, n_reps/10)
+
+  !---------------------------------------------------------------
+  ! 1) build the 2x2 nested operator with real per-block halos
+  !---------------------------------------------------------------
+  call nested_matrix%init(context, [field1_size, field2_size], info)
+  if (info /= psb_success_) then
+    if (my_rank==0) write(*,*) 'FAIL: init info=', info; goto 9999
+  end if
+  field1_rows = nested_matrix%get_owned_rows(1)
+  field2_rows = nested_matrix%get_owned_rows(2)
+  field1_local_rows = size(field1_rows)
+  field2_local_rows = size(field2_rows)
+
+  ! A = tridiag(-1,2,-1) -> block (1,1)
+  allocate(entry_rows(3*field1_local_rows), entry_cols(3*field1_local_rows), entry_vals(3*field1_local_rows))
+  entry_idx = 0
+  do i_local_row = 1, field1_local_rows
+    global_row = field1_rows(i_local_row)
+    entry_idx = entry_idx + 1
+    entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row; entry_vals(entry_idx) = 2.0_psb_dpk_
+    if (global_row > 1) then
+      entry_idx = entry_idx + 1
+      entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row - 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_
+    end if
+    if (global_row < field1_size) then
+      entry_idx = entry_idx + 1
+      entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = global_row + 1_psb_lpk_; entry_vals(entry_idx) = -1.0_psb_dpk_
+    end if
+  end do
+  call nested_matrix%ins(1, 1, entry_idx, entry_rows, entry_cols, entry_vals, info)
+  deallocate(entry_rows, entry_cols, entry_vals)
+
+  ! B^T -> block (1,2): rows in field 1, columns in field 2 (RECTANGULAR n1 x n2).
+  ! Each row r couples to field-2 columns (r mod n2) and (r + n2/2 mod n2); the
+  ! second one lands on another process => real, distinct halo for this block.
+  allocate(entry_rows(2*field1_local_rows), entry_cols(2*field1_local_rows), entry_vals(2*field1_local_rows))
+  entry_idx = 0
+  do i_local_row = 1, field1_local_rows
+    global_row = field1_rows(i_local_row)
+    entry_idx = entry_idx + 1
+    gcol = mod(global_row - 1_psb_lpk_, field2_size) + 1_psb_lpk_
+    entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.5_psb_dpk_
+    entry_idx = entry_idx + 1
+    gcol = mod(global_row - 1_psb_lpk_ + stride, field2_size) + 1_psb_lpk_
+    entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.25_psb_dpk_
+  end do
+  call nested_matrix%ins(1, 2, entry_idx, entry_rows, entry_cols, entry_vals, info)
+  deallocate(entry_rows, entry_cols, entry_vals)
+
+  ! B -> block (2,1): rows in field 2, columns in field 1 (RECTANGULAR n2 x n1).
+  allocate(entry_rows(2*field2_local_rows), entry_cols(2*field2_local_rows), entry_vals(2*field2_local_rows))
+  entry_idx = 0
+  do i_local_row = 1, field2_local_rows
+    global_row = field2_rows(i_local_row)
+    entry_idx = entry_idx + 1
+    gcol = mod(global_row - 1_psb_lpk_, field1_size) + 1_psb_lpk_
+    entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.3_psb_dpk_
+    entry_idx = entry_idx + 1
+    gcol = mod(global_row - 1_psb_lpk_ + stride, field1_size) + 1_psb_lpk_
+    entry_rows(entry_idx) = global_row; entry_cols(entry_idx) = gcol; entry_vals(entry_idx) = 0.15_psb_dpk_
+  end do
+  call nested_matrix%ins(2, 1, entry_idx, entry_rows, entry_cols, entry_vals, info)
+  deallocate(entry_rows, entry_cols, entry_vals)
+
+  call nested_matrix%asb(info)
+  if (info /= psb_success_) then
+    if (my_rank==0) write(*,*) 'FAIL: asb info=', info; goto 9999
+  end if
+
+  !---------------------------------------------------------------
+  ! 2) global-local work vectors (x[g] = g on the owned entries)
+  !---------------------------------------------------------------
+  n_local_rows = nested_matrix%desc_glob%get_local_rows()
+  n_local_cols = nested_matrix%desc_glob%get_local_cols()
+  allocate(x_arr(n_local_cols), work_x(n_local_cols), &
+       &   y_union(n_local_cols), y_perblock(n_local_cols))
+  x_arr = dzero
+  do i_local_row = 1, n_local_rows
+    call nested_matrix%desc_glob%l2g(i_local_row, global_row, info)
+    x_arr(i_local_row) = real(global_row, psb_dpk_)
+  end do
+
+  !---------------------------------------------------------------
+  ! 3) correctness: union vs per-block on the same x
+  !---------------------------------------------------------------
+  work_x = x_arr
+  call psb_spmm(done, nested_matrix%a_glob, work_x, dzero, y_union, nested_matrix%desc_glob, info)
+  if (info /= psb_success_) then
+    if (my_rank==0) write(*,*) 'FAIL: psb_spmm (union) info=', info; goto 9999
+  end if
+  call nest_spmv_perblock(nested_matrix, done, x_arr, dzero, y_perblock, info, n_exch, .false.)
+  if (info /= psb_success_) then
+    if (my_rank==0) write(*,*) 'FAIL: per-block driver info=', info; goto 9999
+  end if
+  mismatch_norm = dzero
+  do i_local_row = 1, n_local_rows
+    mismatch_norm = max(mismatch_norm, abs(y_union(i_local_row) - y_perblock(i_local_row)))
+  end do
+  call psb_amx(context, mismatch_norm)
+
+  !---------------------------------------------------------------
+  ! 4) per-scheme PURE COMMUNICATION sweep.
+  !    The comm scheme is selected per descriptor (desc%set_comm_scheme) and is
+  !    honoured ONLY by the encapsulated vect path; the array path used in (3)
+  !    is always baseline.  For each scheme we time, on persistent vects:
+  !      union     = one psb_halo over desc_glob
+  !      per-block = one psb_halo per present block_col_desc(i,j)
+  !---------------------------------------------------------------
+  n_schemes = 5
+  allocate(schemes(n_schemes), scheme_names(n_schemes))
+  schemes = [ psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_,            &
+       &      psb_comm_persistent_ineighbor_alltoallv_,                        &
+       &      psb_comm_rma_pull_, psb_comm_rma_push_ ]
+  scheme_names = [ character(len=34) :: 'isend_irecv (baseline)',              &
+       &           'ineighbor_alltoallv', 'persistent_ineighbor_alltoallv',    &
+       &           'rma_pull', 'rma_push' ]
+  allocate(bvect(nested_matrix%n_fields, nested_matrix%n_fields))
+
+  if (my_rank == 0) then
+    write(*,'(a,i0,a,i0,a,i0,a,i0,a,i0)') ' np=', num_procs, '  n1=', field1_size, &
+         &                           '  n2=', field2_size, '  stride=', stride, '  reps=', n_reps
+    write(*,'(a,es12.4)') ' max|y_union - y_perblock| = ', mismatch_norm
+    if (mismatch_norm <= tolerance) then
+      write(*,*) ' PASS: regimes agree'
+    else
+      write(*,*) ' FAIL: regimes disagree'
+    end if
+    write(*,'(a,i0)') ' halo exchanges:  union = 1   per-block = ', n_exch
+    write(*,*)
+    write(*,'(a)') ' pure halo communication time [s]  (min over reps, slowest rank)'
+    write(*,'(a)') ' scheme                              union           per-block       ratio'
+  end if
+
+  do s_idx = 1, n_schemes
+    call set_scheme(schemes(s_idx))
+    call build_vects()
+    call time_comm('U', t_u, ok_u)
+    call time_comm('B', t_b, ok_b)
+    call free_vects()
+    if (my_rank == 0) then
+      if (ok_u .and. ok_b) then
+        write(*,'(1x,a34,es15.4,es15.4,3x,f7.2)') scheme_names(s_idx), t_u, t_b, &
+             &                                     t_b/max(t_u, tiny(t_u))
+      else
+        write(*,'(1x,a34,a)') scheme_names(s_idx), '   (unavailable on this build/MPI)'
+      end if
+    end if
+  end do
+
+  deallocate(bvect, schemes, scheme_names)
+  call nested_matrix%free(info)
+9999 continue
+  call psb_exit(context)
+
+contains
+
+  ! Set the communication scheme on the global descriptor and on every present
+  ! per-block descriptor (the per-block exchanges must use the same scheme).
+  subroutine set_scheme(scheme)
+    integer(psb_ipk_), intent(in) :: scheme
+    integer(psb_ipk_) :: i, j, linfo
+    call nested_matrix%desc_glob%set_comm_scheme(scheme, linfo)
+    do j = 1, nested_matrix%n_fields
+      do i = 1, nested_matrix%n_fields
+        if (nested_matrix%block_storage%has_block(i,j)) &
+             & call nested_matrix%block_col_desc(i,j)%set_comm_scheme(scheme, linfo)
+      end do
+    end do
+  end subroutine set_scheme
+
+  ! Fresh persistent halo vectors (the comm_handle, hence the scheme, is created
+  ! from desc%comm_type on the first psb_halo and then reused across reps).
+  subroutine build_vects()
+    integer(psb_ipk_) :: i, j, linfo
+    call psb_geall(xg_vect, nested_matrix%desc_glob, linfo)
+    call psb_geasb(xg_vect, nested_matrix%desc_glob, linfo)
+    do j = 1, nested_matrix%n_fields
+      do i = 1, nested_matrix%n_fields
+        if (.not. nested_matrix%block_storage%has_block(i,j)) cycle
+        call psb_geall(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
+        call psb_geasb(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
+      end do
+    end do
+  end subroutine build_vects
+
+  subroutine free_vects()
+    integer(psb_ipk_) :: i, j, linfo
+    call xg_vect%free(linfo)
+    do j = 1, nested_matrix%n_fields
+      do i = 1, nested_matrix%n_fields
+        if (nested_matrix%block_storage%has_block(i,j)) call bvect(i,j)%free(linfo)
+      end do
+    end do
+  end subroutine free_vects
+
+  ! Time the pure communication of one regime: n_warm warm-up runs, then min over
+  ! n_reps of the slowest rank's wall time (psb_amx = max across ranks).
+  subroutine time_comm(code, t_min, ok)
+    character,      intent(in)  :: code     ! 'U' union / 'B' per-block
+    real(psb_dpk_), intent(out) :: t_min
+    logical,        intent(out) :: ok
+    integer(psb_ipk_) :: rep, linfo
+    real(psb_dpk_)    :: t0, dt
+    ok = .true.
+    do rep = 1, n_warm
+      call do_comm(code, linfo); if (linfo /= 0) ok = .false.
+    end do
+    t_min = huge(t_min)
+    do rep = 1, n_reps
+      call psb_barrier(context)
+      t0 = psb_wtime()
+      call do_comm(code, linfo)
+      dt = psb_wtime() - t0
+      call psb_amx(context, dt)
+      t_min = min(t_min, dt)
+      if (linfo /= 0) ok = .false.
+    end do
+    if (.not. ok) t_min = dzero
+  end subroutine time_comm
+
+  subroutine do_comm(code, linfo)
+    character,         intent(in)  :: code
+    integer(psb_ipk_), intent(out) :: linfo
+    integer(psb_ipk_) :: i, j
+    linfo = 0
+    if (code == 'U') then
+      call psb_halo(xg_vect, nested_matrix%desc_glob, linfo)
+    else
+      do j = 1, nested_matrix%n_fields
+        do i = 1, nested_matrix%n_fields
+          if (.not. nested_matrix%block_storage%has_block(i,j)) cycle
+          call psb_halo(bvect(i,j), nested_matrix%block_col_desc(i,j), linfo)
+          if (linfo /= 0) return
+        end do
+      end do
+    end if
+  end subroutine do_comm
+
+  ! Per-block nested matvec, exchanging one block's halo at a time.  x, y are in
+  ! the composed global-local layout.  halo_only=.true. performs ONLY the
+  ! per-block psb_halo calls (for the pure-communication timing).
+  subroutine nest_spmv_perblock(op, alpha, x, beta, y, info, n_exchanges, halo_only)
+    type(psb_d_nest_matrix), intent(inout) :: op
+    real(psb_dpk_),          intent(in)    :: x(:)
+    real(psb_dpk_),          intent(inout) :: y(:)
+    real(psb_dpk_),          intent(in)    :: alpha, beta
+    integer(psb_ipk_),       intent(out)   :: info, n_exchanges
+    logical,                 intent(in)    :: halo_only
+
+    integer(psb_ipk_)              :: nf, i, j, k, nrl, nownj, nowni, nfc, nbc
+    integer(psb_ipk_), allocatable :: owned_offset(:)
+    real(psb_dpk_),    allocatable :: xs(:), xf(:), yb(:)
+
+    info = psb_success_; n_exchanges = 0
+    nf = op%n_fields
+    allocate(owned_offset(nf+1))
+    owned_offset(1) = 0
+    do j = 1, nf
+      owned_offset(j+1) = owned_offset(j) + op%field_desc(j)%get_local_rows()
+    end do
+    nrl = owned_offset(nf+1)
+
+    if (.not. halo_only) then
+      if (beta == dzero) then
+        y(1:nrl) = dzero
+      else if (beta /= done) then
+        y(1:nrl) = beta * y(1:nrl)
+      end if
+    end if
+
+    do j = 1, nf
+      nownj = op%field_desc(j)%get_local_rows()
+      nfc   = op%field_desc(j)%get_local_cols()
+      do i = 1, nf
+        if (.not. op%block_storage%has_block(i,j)) cycle
+        nbc = op%block_col_desc(i,j)%get_local_cols()
+        allocate(xs(nbc)); xs = dzero
+        xs(1:nownj) = x(owned_offset(j)+1 : owned_offset(j)+nownj)
+        call psb_halo(xs, op%block_col_desc(i,j), info)
+        if (info /= psb_success_) return
+        n_exchanges = n_exchanges + 1
+        if (.not. halo_only) then
+          allocate(xf(nfc)); xf = dzero
+          do k = 1, nbc
+            xf(op%blk2field(i,j)%map(k)) = xs(k)
+          end do
+          nowni = op%field_desc(i)%get_local_rows()
+          allocate(yb(nowni)); yb = dzero
+          call op%block_storage%mats(i,j)%a%csmv(alpha, xf, dzero, yb, info)
+          if (info /= psb_success_) return
+          y(owned_offset(i)+1 : owned_offset(i)+nowni) = &
+               & y(owned_offset(i)+1 : owned_offset(i)+nowni) + yb
+          deallocate(xf, yb)
+        end if
+        deallocate(xs)
+      end do
+    end do
+    deallocate(owned_offset)
+  end subroutine nest_spmv_perblock
+
+end program psb_d_nest_halo_regime_test