From be6f27069a61ae004fca598400f5f382037008c1 Mon Sep 17 00:00:00 2001 From: Stack-1 Date: Tue, 21 Apr 2026 14:56:47 +0200 Subject: [PATCH] =?UTF-8?q?[UPDATE]=20Modified=20tests=20to=20sup=C3=A8por?= =?UTF-8?q?t=20GPU=20SpMV=20computing,=20Moreover=20dinstinct=20routine=20?= =?UTF-8?q?for=20persistant=20is=20now=20present=20in=20psi=5Fdswpadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/comm/internals/psi_dswapdata.F90 | 1320 +++++++---------- .../psb_comm_neighbor_impl_mod.F90 | 20 - test/comm/cg/Makefile | 6 +- test/comm/cg/psb_comm_cg_test.F90 | 131 +- test/comm/cg/psb_pde3d_cg_noprec.inp | 18 - test/comm/spmv/Makefile | 13 +- test/comm/spmv/psb_spmv_overlap_test.f90 | 1077 -------------- test/comm/spmv/psb_spmv_test.f90 | 694 +++++++++ test/comm/spmv/spmv_overlap.f90 | 28 - 9 files changed, 1379 insertions(+), 1928 deletions(-) delete mode 100644 test/comm/cg/psb_pde3d_cg_noprec.inp delete mode 100644 test/comm/spmv/psb_spmv_overlap_test.f90 create mode 100644 test/comm/spmv/psb_spmv_test.f90 delete mode 100644 test/comm/spmv/spmv_overlap.f90 diff --git a/base/comm/internals/psi_dswapdata.F90 b/base/comm/internals/psi_dswapdata.F90 index f7030eb1..d8c7fff9 100644 --- a/base/comm/internals/psi_dswapdata.F90 +++ b/base/comm/internals/psi_dswapdata.F90 @@ -85,105 +85,9 @@ submodule (psi_d_comm_v_mod) psi_d_swapdata_impl use psb_error_mod, only: psb_get_debug_level, psb_get_debug_unit, psb_debug_ext_ use psb_comm_factory_mod - logical, save :: psb_swap_timing_inited = .false. - logical, save :: psb_swap_timing_enabled = .false. - integer(psb_ipk_), save :: psb_swap_timing_max_report = 32 - integer(psb_ipk_), save :: psb_swap_timing_report_count = 0 - integer(psb_ipk_), save :: psb_swap_timing_wrapper_calls = 0 - integer(psb_ipk_), save :: psb_swap_timing_baseline_calls = 0 - integer(psb_ipk_), save :: psb_swap_timing_neighbor_calls = 0 - logical, save :: psb_swap_start_debug_inited = .false. - logical, save :: psb_swap_start_debug_enabled = .false. - integer(psb_ipk_), save :: psb_swap_start_debug_max_report = 128 - integer(psb_ipk_), save :: psb_swap_start_debug_report_count = 0 - contains - subroutine psb_swap_timing_setup() - implicit none - character(len=64) :: env_buf - integer(psb_ipk_) :: env_len, env_status, ios - - if (psb_swap_timing_inited) return - - psb_swap_timing_inited = .true. - psb_swap_timing_enabled = .false. - psb_swap_timing_max_report = 32 - - call get_environment_variable('PSB_SWAP_TIMING', env_buf, length=env_len, status=env_status) - if ((env_status == 0) .and. (env_len > 0)) then - select case(env_buf(1:1)) - case('1','t','T','y','Y') - psb_swap_timing_enabled = .true. - case default - psb_swap_timing_enabled = .false. - end select - end if - - call get_environment_variable('PSB_SWAP_TIMING_MAX_REPORT', env_buf, length=env_len, status=env_status) - if ((env_status == 0) .and. (env_len > 0)) then - read(env_buf(1:env_len), *, iostat=ios) psb_swap_timing_max_report - if ((ios /= 0) .or. (psb_swap_timing_max_report < 1)) psb_swap_timing_max_report = 32 - end if - - end subroutine psb_swap_timing_setup - - logical function psb_swap_timing_should_report() - implicit none - - call psb_swap_timing_setup() - - psb_swap_timing_should_report = .false. - if (.not. psb_swap_timing_enabled) return - if (psb_swap_timing_report_count >= psb_swap_timing_max_report) return - - psb_swap_timing_report_count = psb_swap_timing_report_count + 1 - psb_swap_timing_should_report = .true. - end function psb_swap_timing_should_report - - subroutine psb_swap_start_debug_setup() - implicit none - character(len=64) :: env_buf - integer(psb_ipk_) :: env_len, env_status, ios - - if (psb_swap_start_debug_inited) return - - psb_swap_start_debug_inited = .true. - psb_swap_start_debug_enabled = .false. - psb_swap_start_debug_max_report = 128 - - call get_environment_variable('PSB_SWAP_DEBUG_START', env_buf, length=env_len, status=env_status) - if ((env_status == 0) .and. (env_len > 0)) then - select case(env_buf(1:1)) - case('1','t','T','y','Y') - psb_swap_start_debug_enabled = .true. - case default - psb_swap_start_debug_enabled = .false. - end select - end if - - call get_environment_variable('PSB_SWAP_DEBUG_START_MAX_REPORT', env_buf, length=env_len, status=env_status) - if ((env_status == 0) .and. (env_len > 0)) then - read(env_buf(1:env_len), *, iostat=ios) psb_swap_start_debug_max_report - if ((ios /= 0) .or. (psb_swap_start_debug_max_report < 1)) psb_swap_start_debug_max_report = 128 - end if - end subroutine psb_swap_start_debug_setup - - logical function psb_swap_start_debug_should_report() - implicit none - - call psb_swap_start_debug_setup() - - psb_swap_start_debug_should_report = .false. - if (.not. psb_swap_start_debug_enabled) return - if (psb_swap_start_debug_report_count >= psb_swap_start_debug_max_report) return - - psb_swap_start_debug_report_count = psb_swap_start_debug_report_count + 1 - psb_swap_start_debug_should_report = .true. - end function psb_swap_start_debug_should_report - module subroutine psi_dswapdata_vect(swap_status,beta,y,desc_a,info,data) - #ifdef PSB_MPI_MOD use mpi #endif @@ -201,17 +105,11 @@ contains ! locals type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: np, me, total_send, total_recv, num_neighbors, data_ + integer(psb_ipk_) :: np, my_rank, total_send, total_recv, num_neighbors, data_ class(psb_i_base_vect_type), pointer :: comm_indexes - logical :: debug_on - integer(psb_ipk_) :: dbg_unit - logical :: timing_on, timing_report - real(psb_dpk_) :: t0, t1, t_get_list, t_kernel, t_total - integer(psb_ipk_) :: call_idx - character(len=24) :: phase_name, scheme_name, exchange_name ! communication scheme/status selectors - logical :: baseline, neighbor_a2av + logical :: baseline, ineighbor_a2av, ineighbor_a2av_persistent ! error handling variables integer(psb_ipk_) :: err_act @@ -223,33 +121,13 @@ contains ctxt = desc_a%get_context() - call psb_info(ctxt,me,np) + call psb_info(ctxt,my_rank,np) if (np == -1) then info=psb_err_context_error_ call psb_errpush(info,name) goto 9999 endif - debug_on = (psb_get_debug_level() >= psb_debug_ext_) - call psb_swap_timing_setup() - timing_on = psb_swap_timing_enabled - timing_report = .false. - if (timing_on) then - t_get_list = dzero - t_kernel = dzero - t_total = dzero - t0 = psb_wtime() - call_idx = psb_swap_timing_wrapper_calls + 1 - end if - - if (debug_on) then - dbg_unit = psb_get_debug_unit() - if (dbg_unit <= 0) dbg_unit = psb_err_unit - write(dbg_unit,*) me, trim(name), ': enter swap_status=', swap_status, & - & ' data=', data_, ' local_rows=', desc_a%get_local_rows(), & - & ' local_cols=', desc_a%get_local_cols(), ' y_nrows=', y%get_nrows() - end if - if (.not.psb_is_asb_desc(desc_a)) then info=psb_err_invalid_cd_state_ call psb_errpush(info,name) @@ -262,20 +140,12 @@ contains data_ = psb_comm_halo_ end if - if (timing_on) t1 = psb_wtime() call desc_a%get_list_p(data_,comm_indexes,num_neighbors,total_recv,total_send,info) - if (timing_on) t_get_list = psb_wtime() - t1 if (info /= psb_success_) then call psb_errpush(psb_err_internal_error_,name,a_err='desc_a%get_list_p') goto 9999 end if - if (debug_on) then - write(dbg_unit,*) me, trim(name), ': list_p num_neighbors=', num_neighbors, & - & ' total_send=', total_send, ' total_recv=', total_recv, & - & ' comm_indexes_size=', size(comm_indexes%v) - end if - if( (swap_status /= psb_comm_status_start_).and.(swap_status /= psb_comm_status_wait_)& & .and.(swap_status /= psb_comm_status_sync_) ) then @@ -292,14 +162,6 @@ contains end if end if - ! Debug: comm handle allocation and type - ! if(me == 0) then - ! write(psb_err_unit,*) me, 'CALLING' - ! write(psb_err_unit,*) me, 'DBG: comm_handle allocated=', allocated(y%comm_handle) - ! if (allocated(y%comm_handle)) then - ! write(psb_err_unit,*) me, 'DBG: comm_handle%comm_type=', y%comm_handle%comm_type - ! end if - ! end if ! Set the normalized swap status on the comm handle call y%comm_handle%set_swap_status(swap_status, info) if (info /= psb_success_) then @@ -307,89 +169,33 @@ contains goto 9999 end if - if (debug_on) then - write(dbg_unit,*) me, trim(name), ': comm_type=', y%comm_handle%comm_type, & - & ' swap_status=', swap_status - end if - - ! if(me == 0) then - ! write(psb_err_unit,*) me, 'DBG: after set_swap_status, info=', info - ! end if - baseline = .false. - neighbor_a2av = .false. select case(y%comm_handle%comm_type) - case(psb_comm_ineighbor_alltoallv_, psb_comm_persistent_ineighbor_alltoallv_) - neighbor_a2av = .true. - case default - baseline = .true. - end select - - ! if(me == 0) then - ! write(psb_err_unit,*) me, 'DBG: selected baseline=', baseline, ' neighbor=', neighbor_a2av - ! end if - - if (baseline) then - if (timing_on) t1 = psb_wtime() + case(psb_comm_isend_irecv_) call psi_dswap_baseline_vect(ctxt,swap_status,beta,y,comm_indexes,num_neighbors,total_send,total_recv,y%comm_handle,info) - if (timing_on) t_kernel = psb_wtime() - t1 if (info /= psb_success_) then call psb_errpush(info,name,a_err='baseline swap') goto 9999 end if - else if (neighbor_a2av) then - if (timing_on) t1 = psb_wtime() + case(psb_comm_ineighbor_alltoallv_) call psi_dswap_neighbor_topology_vect(ctxt,swap_status,beta,y,comm_indexes,num_neighbors,& & total_send,total_recv,y%comm_handle,info) - if (timing_on) t_kernel = psb_wtime() - t1 if (info /= psb_success_) then - call psb_errpush(info,name,a_err='neighbor a2av swap') + call psb_errpush(info,name,a_err='neighbor nonblocking swap') goto 9999 end if - else + case(psb_comm_persistent_ineighbor_alltoallv_) + call psi_dswap_neighbor_persistent_topology_vect(ctxt,swap_status,beta,y,comm_indexes,num_neighbors,& + & total_send,total_recv,y%comm_handle,info) + if (info /= psb_success_) then + call psb_errpush(info,name,a_err='neighbor persistent swap') + goto 9999 + end if + case default info = psb_err_mpi_error_ - call psb_errpush(info,name,a_err='Incompatible swap_status settings: neither baseline nor neighbor_a2av is true') + call psb_errpush(info,name,a_err='Incompatible swap_status settings: no valid communication mode selected') goto 9999 - end if - - if (timing_on) then - t_total = psb_wtime() - t0 - call psb_amx(ctxt, t_get_list) - call psb_amx(ctxt, t_kernel) - call psb_amx(ctxt, t_total) - if (me == psb_root_) timing_report = psb_swap_timing_should_report() - if ((me == psb_root_) .and. timing_report) then - psb_swap_timing_wrapper_calls = call_idx - select case(swap_status) - case(psb_comm_status_start_) - phase_name = 'start' - case(psb_comm_status_wait_) - phase_name = 'wait' - case(psb_comm_status_sync_) - phase_name = 'sync' - case default - phase_name = 'unknown' - end select - if (baseline) then - scheme_name = 'baseline' - else - if (y%comm_handle%comm_type == psb_comm_persistent_ineighbor_alltoallv_) then - scheme_name = 'persistent_neighbor' - else - scheme_name = 'neighbor' - end if - end if - if (call_idx == 1) then - exchange_name = 'first' - else - exchange_name = 'steady' - end if - write(psb_out_unit,'("SWAP_TIMING wrapper scheme=",a,", phase=",a,", exchange=",a,", call=",i0)') & - & trim(scheme_name), trim(phase_name), trim(exchange_name), call_idx - write(psb_out_unit,'(" get_list=",es12.5,", kernel=",es12.5,", total=",es12.5)') & - & t_get_list, t_kernel, t_total - end if - end if + end select call psb_erractionrestore(err_act) return @@ -419,7 +225,7 @@ contains ! locals integer(psb_mpk_) :: icomm - integer(psb_mpk_) :: np, me + integer(psb_mpk_) :: np, my_rank integer(psb_mpk_) :: proc_to_comm, p2ptag, p2pstat(mpi_status_size),& & iret, nesd, nerv integer(psb_mpk_), allocatable :: prcid(:) @@ -429,18 +235,12 @@ contains logical :: do_send,do_recv logical, parameter :: usersend=.false. logical :: debug - logical :: timing_on, timing_report - integer(psb_ipk_) :: dbg_unit - character(len=20) :: name - real(psb_dpk_) :: t0, t1 - real(psb_dpk_) :: t_buf, t_gth, t_post, t_wait, t_sct, t_dev, t_total - integer(psb_ipk_) :: call_idx - character(len=12) :: exchange_name + character(len=20) :: name info = psb_success_ name = 'psi_dswap_baseline_vect' call psb_erractionsave(err_act) - call psb_info(ctxt,me,np) + call psb_info(ctxt,my_rank,np) if (np == -1) then info = psb_err_context_error_ call psb_errpush(info,name) @@ -449,24 +249,6 @@ contains icomm = ctxt%get_mpic() - debug = (psb_get_debug_level() >= psb_debug_ext_) - call psb_swap_timing_setup() - timing_on = psb_swap_timing_enabled - timing_report = .false. - if (timing_on) then - t_buf = dzero - t_gth = dzero - t_post = dzero - t_wait = dzero - t_sct = dzero - t_dev = dzero - t_total = dzero - t0 = psb_wtime() - call_idx = psb_swap_timing_baseline_calls + 1 - end if - dbg_unit = psb_get_debug_unit() - if (dbg_unit <= 0) dbg_unit = psb_err_unit - baseline_comm_handle => null() select type(ch => comm_handle) type is(psb_comm_baseline_handle) @@ -491,7 +273,7 @@ contains total_send_ = total_send * n call comm_indexes%sync() - if (debug) write(dbg_unit,*) me,'Internal buffer' + if (debug) write(*,*) my_rank,'Internal buffer' if (do_send) then if (allocated(baseline_comm_handle%comid)) then if (any(baseline_comm_handle%comid /= mpi_request_null)) then @@ -503,16 +285,12 @@ contains goto 9999 end if end if - if (debug) write(dbg_unit,*) me,'do_send start' - if (timing_on) t1 = psb_wtime() call y%new_buffer(ione*size(comm_indexes%v),info) call psb_realloc(num_neighbors,2_psb_ipk_,baseline_comm_handle%comid,info) baseline_comm_handle%comid = mpi_request_null call psb_realloc(num_neighbors,prcid,info) - if (timing_on) t_buf = t_buf + (psb_wtime() - t1) ! First I post all the non blocking receives pnti = 1 - if (timing_on) t1 = psb_wtime() do i=1, num_neighbors proc_to_comm = comm_indexes%v(pnti+psb_proc_id_) nerv = comm_indexes%v(pnti+psb_n_elem_recv_) @@ -520,22 +298,20 @@ contains rcv_pt = 1+pnti+psb_n_elem_recv_ prcid(i) = psb_get_mpi_rank(ctxt,proc_to_comm) - if ((nerv>0).and.(proc_to_comm /= me)) then - if (debug) write(dbg_unit,*) me,'Posting receive from',prcid(i),rcv_pt + if ((nerv>0).and.(proc_to_comm /= my_rank)) then + if (debug) write(*,*) my_rank,'Posting receive from',prcid(i),rcv_pt p2ptag = psb_double_swap_tag - call mpi_irecv(y%combuf(rcv_pt),nerv,& - & psb_mpi_r_dpk_,prcid(i),& - & p2ptag, icomm,baseline_comm_handle%comid(i,2),iret) + call mpi_irecv(y%combuf(rcv_pt),nerv,& + & psb_mpi_r_dpk_,prcid(i),& + & p2ptag, icomm,baseline_comm_handle%comid(i,2),iret) end if pnti = pnti + nerv + nesd + 3 end do - if (timing_on) t_post = t_post + (psb_wtime() - t1) - if (debug) write(dbg_unit,*) me,' Gather ' + if (debug) write(*,*) my_rank,' Gather ' ! ! Then gather for sending. ! pnti = 1 - if (timing_on) t1 = psb_wtime() do i=1, num_neighbors nerv = comm_indexes%v(pnti+psb_n_elem_recv_) nesd = comm_indexes%v(pnti+nerv+psb_n_elem_send_) @@ -555,16 +331,13 @@ contains call y%gth(idx_pt,nesd,comm_indexes) pnti = pnti + nerv + nesd + 3 end do - if (timing_on) t_gth = t_gth + (psb_wtime() - t1) ! ! Then wait ! - if (timing_on) t1 = psb_wtime() call y%device_wait() - if (timing_on) t_dev = t_dev + (psb_wtime() - t1) - if (debug) write(dbg_unit,*) me,' isend' + if (debug) write(*,*) my_rank,' isend' ! ! Then send ! @@ -573,7 +346,6 @@ contains snd_pt = 1 rcv_pt = 1 p2ptag = psb_double_swap_tag - if (timing_on) t1 = psb_wtime() do i=1, num_neighbors proc_to_comm = comm_indexes%v(pnti+psb_proc_id_) nerv = comm_indexes%v(pnti+psb_n_elem_recv_) @@ -581,10 +353,10 @@ contains snd_pt = 1+pnti+nerv+psb_n_elem_send_ rcv_pt = 1+pnti+psb_n_elem_recv_ - if ((nesd>0).and.(proc_to_comm /= me)) then - call mpi_isend(y%combuf(snd_pt),nesd,& - & psb_mpi_r_dpk_,prcid(i),& - & p2ptag,icomm,baseline_comm_handle%comid(i,1),iret) + if ((nesd>0).and.(proc_to_comm /= my_rank)) then + call mpi_isend(y%combuf(snd_pt),nesd,& + & psb_mpi_r_dpk_,prcid(i),& + & p2ptag,icomm,baseline_comm_handle%comid(i,1),iret) end if if(iret /= mpi_success) then @@ -595,11 +367,10 @@ contains pnti = pnti + nerv + nesd + 3 end do - if (timing_on) t_post = t_post + (psb_wtime() - t1) end if if (do_recv) then - if (debug) write(dbg_unit,*) me,' do_Recv' + if (debug) write(*,*) my_rank,' do_Recv' if (.not.allocated(baseline_comm_handle%comid)) then ! ! No matching send? Something is wrong.... @@ -610,10 +381,9 @@ contains end if call psb_realloc(num_neighbors,prcid,info) - if (debug) write(dbg_unit,*) me,' wait' + if (debug) write(*,*) my_rank,' wait' pnti = 1 p2ptag = psb_double_swap_tag - if (timing_on) t1 = psb_wtime() do i=1, num_neighbors proc_to_comm = comm_indexes%v(pnti+psb_proc_id_) nerv = comm_indexes%v(pnti+psb_n_elem_recv_) @@ -621,7 +391,7 @@ contains snd_pt = 1+pnti+nerv+psb_n_elem_send_ rcv_pt = 1+pnti+psb_n_elem_recv_ - if (proc_to_comm /= me)then + if (proc_to_comm /= my_rank)then if (nesd>0) then call mpi_wait(baseline_comm_handle%comid(i,1),p2pstat,iret) if(iret /= mpi_success) then @@ -638,7 +408,7 @@ contains goto 9999 end if end if - else if (proc_to_comm == me) then + else if (proc_to_comm == my_rank) then if (nesd /= nerv) then write(psb_err_unit,*) & & 'Fatal error in swapdata: mismatch on self send',& @@ -646,15 +416,13 @@ contains end if y%combuf(rcv_pt:rcv_pt+nerv-1) = y%combuf(snd_pt:snd_pt+nesd-1) end if - pnti = pnti + nerv + nesd + 3 + pnti = pnti + nerv + nesd + 3 end do - if (timing_on) t_wait = t_wait + (psb_wtime() - t1) - if (debug) write(dbg_unit,*) me,' scatter' + if (debug) write(*,*) my_rank,' scatter' pnti = 1 snd_pt = 1 rcv_pt = 1 - if (timing_on) t1 = psb_wtime() do i=1, num_neighbors proc_to_comm = comm_indexes%v(pnti+psb_proc_id_) nerv = comm_indexes%v(pnti+psb_n_elem_recv_) @@ -674,12 +442,12 @@ contains goto 9999 end if - if (debug) write(dbg_unit,*)me,' Received from: ',prcid(i),& + if (debug) write(*,*)my_rank,' Received from: ',prcid(i),& & y%combuf(rcv_pt:rcv_pt+nerv-1) call y%sct(rcv_pt,nerv,comm_indexes,beta) - pnti = pnti + nerv + nesd + 3 + pnti = pnti + nerv + nesd + 3 end do - if (timing_on) t_sct = t_sct + (psb_wtime() - t1) + ! ! Waited for everybody, clean up ! @@ -688,10 +456,9 @@ contains ! ! Then wait for device ! - if (debug) write(dbg_unit,*) me,' wait' - if (timing_on) t1 = psb_wtime() + if (debug) write(*,*) my_rank,' wait' call y%device_wait() - if (debug) write(dbg_unit,*) me,' free buffer' + if (debug) write(*,*) my_rank,' free buffer' call y%maybe_free_buffer(info) if (info == 0) then if (allocated(y%comm_handle)) call y%comm_handle%free(info) @@ -700,33 +467,7 @@ contains call psb_errpush(psb_err_alloc_dealloc_,name) goto 9999 end if - if (timing_on) t_dev = t_dev + (psb_wtime() - t1) - if (debug) write(dbg_unit,*) me,' done' - end if - - if (timing_on) then - t_total = psb_wtime() - t0 - call psb_amx(ctxt, t_buf) - call psb_amx(ctxt, t_gth) - call psb_amx(ctxt, t_post) - call psb_amx(ctxt, t_wait) - call psb_amx(ctxt, t_sct) - call psb_amx(ctxt, t_dev) - call psb_amx(ctxt, t_total) - if (me == psb_root_) timing_report = psb_swap_timing_should_report() - if ((me == psb_root_) .and. timing_report) then - psb_swap_timing_baseline_calls = call_idx - if (call_idx == 1) then - exchange_name = 'first' - else - exchange_name = 'steady' - end if - write(psb_out_unit,'("SWAP_TIMING baseline phase start=",l1,", wait=",l1)') do_send, do_recv - write(psb_out_unit,'(" exchange=",a,", call=",i0)') trim(exchange_name), call_idx - write(psb_out_unit,'(" buf=",es12.5,", gth=",es12.5,", post=",es12.5,", wait=",es12.5)') & - & t_buf, t_gth, t_post, t_wait - write(psb_out_unit,'(" sct=",es12.5,", dev=",es12.5,", total=",es12.5)') t_sct, t_dev, t_total - end if + if (debug) write(*,*) my_rank,' done' end if @@ -761,25 +502,20 @@ contains ! locals integer(psb_mpk_) :: icomm - integer(psb_mpk_) :: np, me + integer(psb_mpk_) :: np, my_rank integer(psb_mpk_) :: iret, p2pstat(mpi_status_size) type(psb_comm_neighbor_handle), pointer :: neighbor_comm_handle integer(psb_ipk_) :: err_act, topology_total_send, topology_total_recv, buffer_size logical :: do_start, do_wait logical :: debug - logical :: timing_on, timing_report - integer(psb_ipk_) :: dbg_unit character(len=30) :: name - real(psb_dpk_) :: t0, t1 - real(psb_dpk_) :: t_topo, t_buf, t_gth, t_init, t_post, t_wait, t_sct, t_dev, t_total - integer(psb_ipk_) :: call_idx - character(len=12) :: exchange_name + info = psb_success_ name = 'psi_dswap_neighbor_topology_vect' call psb_erractionsave(err_act) - call psb_info(ctxt,me,np) + call psb_info(ctxt,my_rank,np) if (np == -1) then info=psb_err_context_error_ call psb_errpush(info,name) @@ -787,25 +523,6 @@ contains endif icomm = ctxt%get_mpic() - dbg_unit = psb_get_debug_unit() - if (dbg_unit <= 0) dbg_unit = psb_err_unit - debug = (psb_get_debug_level() >= psb_debug_ext_) - call psb_swap_timing_setup() - timing_on = psb_swap_timing_enabled - timing_report = .false. - if (timing_on) then - t_topo = dzero - t_buf = dzero - t_gth = dzero - t_init = dzero - t_post = dzero - t_wait = dzero - t_sct = dzero - t_dev = dzero - t_total = dzero - t0 = psb_wtime() - call_idx = psb_swap_timing_neighbor_calls + 1 - end if neighbor_comm_handle => null() select type(ch => comm_handle) @@ -832,19 +549,211 @@ contains ! START phase: build topology (if needed), gather, post MPI ! --------------------------------------------------------- if (do_start) then - if(debug) write(dbg_unit,*) me,' nbr_vect: starting data exchange' - if (neighbor_comm_handle%use_persistent_buffers) then - if (neighbor_comm_handle%persistent_in_flight) then + if(debug) write(*,*) my_rank,' nbr_vect: starting data exchange (nonblocking)' + if (.not. neighbor_comm_handle%is_initialized) then + if (debug) write(*,*) my_rank,' nbr_vect: building topology via handle' + call neighbor_comm_handle%topology_init(comm_indexes%v, num_neighbors, total_send, total_recv, ctxt, icomm, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_internal_error_, name, a_err='neighbor_topology_init') + goto 9999 + end if + end if + topology_total_send = neighbor_comm_handle%total_send + topology_total_recv = neighbor_comm_handle%total_recv + + ! Buffer layout: + ! combuf(1 : total_send) = send area + ! combuf(total_send+1 : total_send+total_recv) = recv area + buffer_size = topology_total_send + topology_total_recv + + if (buffer_size > 0) then + call y%new_buffer(buffer_size, info) + if (info /= 0) then + call psb_errpush(psb_err_alloc_dealloc_, name) + goto 9999 + end if + neighbor_comm_handle%comm_request = mpi_request_null + + ! Gather send data into contiguous send buffer (polymorphic for GPU) + if (debug) write(*,*) my_rank,' nbr_vect: gathering send data,', topology_total_send,' elems' + call y%gth(int(topology_total_send,psb_mpk_), & + & neighbor_comm_handle%send_indexes, & + & y%combuf(1:topology_total_send)) + else + ! No data to send/recv: ensure request indicates idle state + neighbor_comm_handle%comm_request = mpi_request_null + end if + + ! Wait for device (important for GPU subclasses) + call y%device_wait() + + ! Post non-blocking neighborhood alltoallv + if (debug) write(*,*) my_rank,' nbr_vect: posting MPI_Ineighbor_alltoallv' + if (buffer_size > 0) then + call mpi_ineighbor_alltoallv( & + & y%combuf(1), & ! send buffer + & neighbor_comm_handle%send_counts, & + & neighbor_comm_handle%send_displs, & + & psb_mpi_r_dpk_, & + & y%combuf(topology_total_send + 1), & ! recv buffer + & neighbor_comm_handle%recv_counts, & + & neighbor_comm_handle%recv_displs, & + & psb_mpi_r_dpk_, & + & neighbor_comm_handle%graph_comm, & + & neighbor_comm_handle%comm_request, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, m_err=(/iret/)) + goto 9999 + end if + else + neighbor_comm_handle%comm_request = mpi_request_null + end if + + end if ! do_start + + ! --------------------------------------------------------- + ! WAIT phase: complete MPI, scatter received data + ! --------------------------------------------------------- + if (do_wait) then + + topology_total_send = neighbor_comm_handle%total_send + topology_total_recv = neighbor_comm_handle%total_recv + + if ((topology_total_send + topology_total_recv) == 0) then + ! Valid no-op exchange: nothing was posted in START and nothing to wait/scatter. + neighbor_comm_handle%comm_request = mpi_request_null + else + if (neighbor_comm_handle%comm_request == mpi_request_null) then + write(psb_err_unit,*) my_rank, 'DBG: neighbor WAIT but comm_request is NULL; is_initialized=', & + & neighbor_comm_handle%is_initialized info = psb_err_mpi_error_ - call psb_errpush(info, name, a_err='Invalid START: persistent neighbor request already in flight') + call psb_errpush(info, name, m_err=(/-2/)) goto 9999 end if end if + + ! Only wait and scatter if there's data + if ((topology_total_send + topology_total_recv) > 0) then + ! Wait for the non-blocking collective to complete + if (debug) write(*,*) my_rank,' nbr_vect: waiting on MPI request' + call mpi_wait(neighbor_comm_handle%comm_request, p2pstat, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, m_err=(/iret/)) + goto 9999 + end if + + ! Scatter received data to local vector positions (polymorphic for GPU) + if (debug) write(*,*) my_rank,' nbr_vect: scattering recv data,', topology_total_recv,' elems' + call y%sct(int(topology_total_recv,psb_mpk_), & + & neighbor_comm_handle%recv_indexes, & + & y%combuf(topology_total_send+1:topology_total_send+topology_total_recv), & + & beta) + else + ! nothing to wait/scatter + end if + + + ! Clean up + neighbor_comm_handle%comm_request = mpi_request_null + call y%device_wait() + call y%maybe_free_buffer(info) + if (info /= 0) then + call psb_errpush(psb_err_alloc_dealloc_, name) + goto 9999 + end if + if (debug) write(*,*) my_rank,' nbr_vect: done' + + end if ! do_wait + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(ctxt,err_act) + + return + end subroutine psi_dswap_neighbor_topology_vect + + + + subroutine psi_dswap_neighbor_persistent_topology_vect(ctxt,swap_status,beta,y,comm_indexes,& + & num_neighbors,total_send,total_recv,comm_handle,info) +#ifdef PSB_MPI_MOD + use mpi +#endif + implicit none +#ifdef PSB_MPI_H + include 'mpif.h' +#endif + + type(psb_ctxt_type), intent(in) :: ctxt + integer(psb_ipk_), intent(in) :: swap_status + real(psb_dpk_), intent(in) :: beta + class(psb_d_base_vect_type), intent(inout) :: y + class(psb_i_base_vect_type), intent(inout) :: comm_indexes + integer(psb_ipk_), intent(in) :: num_neighbors,total_send,total_recv + class(psb_comm_handle_type), intent(inout) :: comm_handle + integer(psb_ipk_), intent(out) :: info + + ! locals + integer(psb_mpk_) :: icomm + integer(psb_mpk_) :: np, my_rank + integer(psb_mpk_) :: iret, p2pstat(mpi_status_size) + type(psb_comm_neighbor_handle), pointer :: neighbor_comm_handle + integer(psb_ipk_) :: err_act, topology_total_send, topology_total_recv, buffer_size + logical :: do_start, do_wait + logical :: debug + character(len=30) :: name + + + + info = psb_success_ + name = 'psi_dswap_neighbor_persistent_topology_vect' + call psb_erractionsave(err_act) + call psb_info(ctxt,my_rank,np) + if (np == -1) then + info=psb_err_context_error_ + call psb_errpush(info,name) + goto 9999 + endif + + icomm = ctxt%get_mpic() + + neighbor_comm_handle => null() + select type(ch => comm_handle) + type is(psb_comm_neighbor_handle) + neighbor_comm_handle => ch + class default + info = psb_err_mpi_error_ + call psb_errpush(info,name,a_err='Expected neighbor comm_handle in persistent neighbor swap') + goto 9999 + end select + + if(swap_status == psb_comm_status_unknown_) then + info = psb_err_mpi_error_ + call psb_errpush(info,name,a_err='Invalid swap_status: psb_comm_status_unknown_ is not allowed in neighbor swap') + goto 9999 + end if + + do_start = (swap_status == psb_comm_status_start_) .or. (swap_status == psb_comm_status_sync_) + do_wait = (swap_status == psb_comm_status_wait_) .or. (swap_status == psb_comm_status_sync_) + + call comm_indexes%sync() + + ! --------------------------------------------------------- + ! START phase: build topology (if needed), gather, post MPI + ! --------------------------------------------------------- + if (do_start) then + if(debug) write(*,*) my_rank,' nbr_vect: starting data exchange (persistent)' + if (neighbor_comm_handle%persistent_in_flight) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, a_err='Invalid START: persistent neighbor request already in flight') + goto 9999 + end if if (.not. neighbor_comm_handle%is_initialized) then - if (debug) write(dbg_unit,*) me,' nbr_vect: building topology via handle' - if (timing_on) t1 = psb_wtime() + if (debug) write(*,*) my_rank,' nbr_vect: building topology via handle' call neighbor_comm_handle%topology_init(comm_indexes%v, num_neighbors, total_send, total_recv, ctxt, icomm, info) - if (timing_on) t_topo = t_topo + (psb_wtime() - t1) if (info /= psb_success_) then call psb_errpush(psb_err_internal_error_, name, a_err='neighbor_topology_init') goto 9999 @@ -859,192 +768,92 @@ contains buffer_size = topology_total_send + topology_total_recv if (buffer_size > 0) then - if (timing_on) t1 = psb_wtime() - if (neighbor_comm_handle%use_persistent_buffers) then - if (.not. allocated(y%combuf)) then - neighbor_comm_handle%diag_buffer_reallocs = neighbor_comm_handle%diag_buffer_reallocs + 1 - if (neighbor_comm_handle%persistent_request_ready) then - if (neighbor_comm_handle%persistent_request /= mpi_request_null) then - call mpi_request_free(neighbor_comm_handle%persistent_request, iret) - end if - neighbor_comm_handle%persistent_request = mpi_request_null - neighbor_comm_handle%persistent_request_ready = .false. - neighbor_comm_handle%persistent_in_flight = .false. - neighbor_comm_handle%persistent_buffer_size = 0 + if (.not. allocated(y%combuf)) then + if (neighbor_comm_handle%persistent_request_ready) then + if (neighbor_comm_handle%persistent_request /= mpi_request_null) then + call mpi_request_free(neighbor_comm_handle%persistent_request, iret) end if - call y%new_buffer(buffer_size, info) - if (info /= 0) then - call psb_errpush(psb_err_alloc_dealloc_, name) - goto 9999 - end if - else if (size(y%combuf) < buffer_size) then - neighbor_comm_handle%diag_buffer_reallocs = neighbor_comm_handle%diag_buffer_reallocs + 1 - if (neighbor_comm_handle%persistent_request_ready) then - if (neighbor_comm_handle%persistent_request /= mpi_request_null) then - call mpi_request_free(neighbor_comm_handle%persistent_request, iret) - end if - neighbor_comm_handle%persistent_request = mpi_request_null - neighbor_comm_handle%persistent_request_ready = .false. - neighbor_comm_handle%persistent_in_flight = .false. - neighbor_comm_handle%persistent_buffer_size = 0 - end if - call y%new_buffer(buffer_size, info) - if (info /= 0) then - call psb_errpush(psb_err_alloc_dealloc_, name) - goto 9999 + neighbor_comm_handle%persistent_request = mpi_request_null + neighbor_comm_handle%persistent_request_ready = .false. + neighbor_comm_handle%persistent_in_flight = .false. + neighbor_comm_handle%persistent_buffer_size = 0 + end if + call y%new_buffer(buffer_size, info) + if (info /= 0) then + call psb_errpush(psb_err_alloc_dealloc_, name) + goto 9999 + end if + else if (size(y%combuf) < buffer_size) then + if (neighbor_comm_handle%persistent_request_ready) then + if (neighbor_comm_handle%persistent_request /= mpi_request_null) then + call mpi_request_free(neighbor_comm_handle%persistent_request, iret) end if + neighbor_comm_handle%persistent_request = mpi_request_null + neighbor_comm_handle%persistent_request_ready = .false. + neighbor_comm_handle%persistent_in_flight = .false. + neighbor_comm_handle%persistent_buffer_size = 0 end if - else call y%new_buffer(buffer_size, info) if (info /= 0) then call psb_errpush(psb_err_alloc_dealloc_, name) goto 9999 end if end if - if (timing_on) t_buf = t_buf + (psb_wtime() - t1) - neighbor_comm_handle%comm_request = mpi_request_null + end if + neighbor_comm_handle%comm_request = mpi_request_null + if (buffer_size > 0) then ! Gather send data into contiguous send buffer (polymorphic for GPU) - if (debug) write(*,*) me,' nbr_vect: gathering send data,', topology_total_send,' elems' - if (timing_on) t1 = psb_wtime() + if (debug) write(*,*) my_rank,' nbr_vect: gathering send data,', topology_total_send,' elems' call y%gth(int(topology_total_send,psb_mpk_), & & neighbor_comm_handle%send_indexes, & & y%combuf(1:topology_total_send)) - if (timing_on) t_gth = t_gth + (psb_wtime() - t1) else - ! No data to send/recv: ensure requests/buffers indicate idle state - neighbor_comm_handle%comm_request = mpi_request_null neighbor_comm_handle%persistent_in_flight = .false. - neighbor_comm_handle%persistent_request_ready = neighbor_comm_handle%persistent_request_ready end if ! Wait for device (important for GPU subclasses) - if (timing_on) t1 = psb_wtime() call y%device_wait() - if (timing_on) t_dev = t_dev + (psb_wtime() - t1) - - if (neighbor_comm_handle%use_persistent_buffers) then - ! Lazy persistent-init: build the request once, then reuse with START/WAIT. - if (.not. neighbor_comm_handle%persistent_request_ready) then -#ifdef PSB_HAVE_MPI_NEIGHBOR_PERSISTENT - if (buffer_size > 0) then - if (debug) write(*,*) me,' nbr_vect: posting MPI_Neighbor_alltoallv_init' - if (timing_on) t1 = psb_wtime() - call mpi_neighbor_alltoallv_init( & - & y%combuf(1), & ! send buffer - & neighbor_comm_handle%send_counts, & - & neighbor_comm_handle%send_displs, & - & psb_mpi_r_dpk_, & - & y%combuf(topology_total_send + 1), & ! recv buffer - & neighbor_comm_handle%recv_counts, & - & neighbor_comm_handle%recv_displs, & - & psb_mpi_r_dpk_, & - & neighbor_comm_handle%graph_comm, & - & mpi_info_null, & - & neighbor_comm_handle%persistent_request, iret) - if (timing_on) t_init = t_init + (psb_wtime() - t1) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/iret/)) - goto 9999 - end if - neighbor_comm_handle%diag_init_calls = neighbor_comm_handle%diag_init_calls + 1 - neighbor_comm_handle%persistent_request_ready = .true. - neighbor_comm_handle%persistent_buffer_size = buffer_size - else - neighbor_comm_handle%persistent_request_ready = .false. - neighbor_comm_handle%persistent_buffer_size = 0 - end if -#else - ! Fallback when persistent neighborhood collectives are not available - neighbor_comm_handle%persistent_request_ready = .false. - neighbor_comm_handle%persistent_buffer_size = 0 -#endif - end if -#ifdef PSB_HAVE_MPI_NEIGHBOR_PERSISTENT - if (buffer_size > 0) then - ! Count the attempt before MPI_Start so we can diagnose call reachability. - neighbor_comm_handle%diag_start_calls = neighbor_comm_handle%diag_start_calls + 1 - if (psb_swap_start_debug_should_report()) then - write(psb_out_unit,'("SWAP_DEBUG MPI_Start(pre) kind=vect rank=",i0,", bsz=",i0,", ready=",l1)') & - & me, buffer_size, neighbor_comm_handle%persistent_request_ready - write(psb_out_unit,'(" inflight=",l1,", req_null=",l1,", dstart=",i0)') & - & neighbor_comm_handle%persistent_in_flight, & - & (neighbor_comm_handle%persistent_request == mpi_request_null), & - & neighbor_comm_handle%diag_start_calls - end if - if (timing_on) t1 = psb_wtime() - call mpi_start(neighbor_comm_handle%persistent_request, iret) - if (timing_on) t_post = t_post + (psb_wtime() - t1) - if (psb_swap_start_debug_should_report()) then - write(psb_out_unit,'("SWAP_DEBUG MPI_Start(post) kind=vect rank=",i0,", iret=",i0)') & - & me, iret - write(psb_out_unit,'(" inflight=",l1,", dstart=",i0)') & - & neighbor_comm_handle%persistent_in_flight, neighbor_comm_handle%diag_start_calls - end if - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/iret/)) - goto 9999 - end if - neighbor_comm_handle%persistent_in_flight = .true. - else - neighbor_comm_handle%persistent_in_flight = .false. - end if -#else + ! Lazy persistent-init: build the request once, then reuse with START/WAIT. + if (.not. neighbor_comm_handle%persistent_request_ready) then if (buffer_size > 0) then - neighbor_comm_handle%diag_ineighbor_calls = neighbor_comm_handle%diag_ineighbor_calls + 1 - if (timing_on) t1 = psb_wtime() - call mpi_ineighbor_alltoallv( & - & y%combuf(1), & ! send buffer + if (debug) write(*,*) my_rank,' nbr_vect: posting MPI_Neighbor_alltoallv_init' + call mpi_neighbor_alltoallv_init( & + & y%combuf(1), & ! send buffer & neighbor_comm_handle%send_counts, & & neighbor_comm_handle%send_displs, & - & psb_mpi_r_dpk_, & - & y%combuf(topology_total_send + 1), & ! recv buffer + & psb_mpi_r_dpk_, & + & y%combuf(topology_total_send + 1), & ! recv buffer & neighbor_comm_handle%recv_counts, & & neighbor_comm_handle%recv_displs, & - & psb_mpi_r_dpk_, & + & psb_mpi_r_dpk_, & & neighbor_comm_handle%graph_comm, & - & neighbor_comm_handle%comm_request, iret) - if (timing_on) t_post = t_post + (psb_wtime() - t1) + & mpi_info_null, & + & neighbor_comm_handle%persistent_request, iret) if (iret /= mpi_success) then info = psb_err_mpi_error_ call psb_errpush(info, name, m_err=(/iret/)) goto 9999 end if - neighbor_comm_handle%persistent_in_flight = .true. + neighbor_comm_handle%persistent_request_ready = .true. + neighbor_comm_handle%persistent_buffer_size = buffer_size else - neighbor_comm_handle%persistent_in_flight = .false. - neighbor_comm_handle%comm_request = mpi_request_null + neighbor_comm_handle%persistent_request_ready = .false. + neighbor_comm_handle%persistent_buffer_size = 0 end if -#endif - else - ! Post non-blocking neighborhood alltoallv - if (debug) write(*,*) me,' nbr_vect: posting MPI_Ineighbor_alltoallv' - if (buffer_size > 0) then - neighbor_comm_handle%diag_ineighbor_calls = neighbor_comm_handle%diag_ineighbor_calls + 1 - if (timing_on) t1 = psb_wtime() - call mpi_ineighbor_alltoallv( & - & y%combuf(1), & ! send buffer - & neighbor_comm_handle%send_counts, & - & neighbor_comm_handle%send_displs, & - & psb_mpi_r_dpk_, & - & y%combuf(topology_total_send + 1), & ! recv buffer - & neighbor_comm_handle%recv_counts, & - & neighbor_comm_handle%recv_displs, & - & psb_mpi_r_dpk_, & - & neighbor_comm_handle%graph_comm, & - & neighbor_comm_handle%comm_request, iret) - if (timing_on) t_post = t_post + (psb_wtime() - t1) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/iret/)) - goto 9999 - end if - else - neighbor_comm_handle%comm_request = mpi_request_null + end if + + if (buffer_size > 0) then + call mpi_start(neighbor_comm_handle%persistent_request, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, m_err=(/iret/)) + goto 9999 end if + neighbor_comm_handle%persistent_in_flight = .true. + else + neighbor_comm_handle%persistent_in_flight = .false. end if end if ! do_start @@ -1059,119 +868,39 @@ contains if ((topology_total_send + topology_total_recv) == 0) then ! Valid no-op exchange: nothing was posted in START and nothing to wait/scatter. - if (neighbor_comm_handle%use_persistent_buffers) then - neighbor_comm_handle%persistent_in_flight = .false. - else - neighbor_comm_handle%comm_request = mpi_request_null - end if + neighbor_comm_handle%persistent_in_flight = .false. else - if (neighbor_comm_handle%use_persistent_buffers) then - if (.not. neighbor_comm_handle%persistent_in_flight) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, a_err='Invalid WAIT: no persistent neighbor request in flight') - goto 9999 - end if - else - if (neighbor_comm_handle%comm_request == mpi_request_null) then - write(psb_err_unit,*) me, 'DBG: neighbor WAIT but comm_request is NULL; is_initialized=', & - & neighbor_comm_handle%is_initialized - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/-2/)) - goto 9999 - end if + if (.not. neighbor_comm_handle%persistent_in_flight) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, a_err='Invalid WAIT: no persistent neighbor request in flight') + goto 9999 end if end if ! Only wait and scatter if there's data if ((topology_total_send + topology_total_recv) > 0) then - ! Wait for the non-blocking collective to complete - if (debug) write(*,*) me,' nbr_vect: waiting on MPI request' - if (timing_on) t1 = psb_wtime() - if (neighbor_comm_handle%use_persistent_buffers) then -#ifdef PSB_HAVE_MPI_NEIGHBOR_PERSISTENT - call mpi_wait(neighbor_comm_handle%persistent_request, p2pstat, iret) -#else - call mpi_wait(neighbor_comm_handle%comm_request, p2pstat, iret) -#endif - else - call mpi_wait(neighbor_comm_handle%comm_request, p2pstat, iret) - end if - if (timing_on) t_wait = t_wait + (psb_wtime() - t1) + ! Wait for the persistent collective to complete + if (debug) write(*,*) my_rank,' nbr_vect: waiting on persistent MPI request' + call mpi_wait(neighbor_comm_handle%persistent_request, p2pstat, iret) if (iret /= mpi_success) then info = psb_err_mpi_error_ call psb_errpush(info, name, m_err=(/iret/)) goto 9999 end if - if (neighbor_comm_handle%use_persistent_buffers) then - neighbor_comm_handle%diag_wait_calls = neighbor_comm_handle%diag_wait_calls + 1 - end if - if (neighbor_comm_handle%use_persistent_buffers) then - neighbor_comm_handle%persistent_in_flight = .false. - end if + neighbor_comm_handle%persistent_in_flight = .false. ! Scatter received data to local vector positions (polymorphic for GPU) - if (debug) write(*,*) me,' nbr_vect: scattering recv data,', topology_total_recv,' elems' - if (timing_on) t1 = psb_wtime() + if (debug) write(*,*) my_rank,' nbr_vect: scattering recv data,', topology_total_recv,' elems' call y%sct(int(topology_total_recv,psb_mpk_), & & neighbor_comm_handle%recv_indexes, & & y%combuf(topology_total_send+1:topology_total_send+topology_total_recv), & & beta) - if (timing_on) t_sct = t_sct + (psb_wtime() - t1) - else - ! nothing to wait/scatter - end if - - - ! Clean up - if ((.not. neighbor_comm_handle%use_persistent_buffers) .or. & - & (neighbor_comm_handle%use_persistent_buffers .and. .not. neighbor_comm_handle%persistent_request_ready)) then - neighbor_comm_handle%comm_request = mpi_request_null - end if - if (timing_on) t1 = psb_wtime() - call y%device_wait() - if (.not. neighbor_comm_handle%use_persistent_buffers) then - call y%maybe_free_buffer(info) - if (info /= 0) then - call psb_errpush(psb_err_alloc_dealloc_, name) - goto 9999 - end if - end if - if (timing_on) t_dev = t_dev + (psb_wtime() - t1) - if (debug) write(*,*) me,' nbr_vect: done' - - end if ! do_wait - - if (timing_on) then - t_total = psb_wtime() - t0 - call psb_amx(ctxt, t_topo) - call psb_amx(ctxt, t_buf) - call psb_amx(ctxt, t_gth) - call psb_amx(ctxt, t_init) - call psb_amx(ctxt, t_post) - call psb_amx(ctxt, t_wait) - call psb_amx(ctxt, t_sct) - call psb_amx(ctxt, t_dev) - call psb_amx(ctxt, t_total) - if (me == psb_root_) timing_report = psb_swap_timing_should_report() - if ((me == psb_root_) .and. timing_report) then - psb_swap_timing_neighbor_calls = call_idx - if (call_idx == 1) then - exchange_name = 'first' - else - exchange_name = 'steady' - end if - if (neighbor_comm_handle%use_persistent_buffers) then - write(psb_out_unit,'("SWAP_TIMING persistent_neighbor phase start=",l1,", wait=",l1)') do_start, do_wait - else - write(psb_out_unit,'("SWAP_TIMING neighbor phase start=",l1,", wait=",l1)') do_start, do_wait - end if - write(psb_out_unit,'(" exchange=",a,", call=",i0)') trim(exchange_name), call_idx - write(psb_out_unit,'(" topo=",es12.5,", buf=",es12.5,", gth=",es12.5,", init=",es12.5)') & - & t_topo, t_buf, t_gth, t_init - write(psb_out_unit,'(" post=",es12.5,", wait=",es12.5,", sct=",es12.5,", dev=",es12.5,", total=",es12.5)') & - & t_post, t_wait, t_sct, t_dev, t_total end if - end if + + call y%device_wait() + if (debug) write(*,*) my_rank,' nbr_vect: done' + + end if ! do_wait call psb_erractionrestore(err_act) return @@ -1179,7 +908,7 @@ contains 9999 call psb_error_handler(ctxt,err_act) return - end subroutine psi_dswap_neighbor_topology_vect + end subroutine psi_dswap_neighbor_persistent_topology_vect @@ -1208,12 +937,12 @@ contains integer(psb_ipk_), optional :: data ! communication scheme/status selectors - logical :: baseline, neighbor_a2av + logical :: baseline, ineighbor_a2av, ineighbor_a2av_persistent ! locals type(psb_ctxt_type) :: ctxt integer(psb_mpk_) :: icomm - integer(psb_ipk_) :: np, me, total_send, total_recv, num_neighbors, data_, err_act + integer(psb_ipk_) :: np, my_rank, total_send, total_recv, num_neighbors, data_, err_act class(psb_i_base_vect_type), pointer :: comm_indexes character(len=30) :: name @@ -1224,7 +953,7 @@ contains ctxt = desc_a%get_context() icomm = ctxt%get_mpic() - call psb_info(ctxt,me,np) + call psb_info(ctxt,my_rank,np) if (np == -1) then info=psb_err_context_error_ call psb_errpush(info,name) @@ -1271,10 +1000,13 @@ contains end if baseline = .false. - neighbor_a2av = .false. + ineighbor_a2av = .false. + ineighbor_a2av_persistent = .false. select case(y%comm_handle%comm_type) - case(psb_comm_ineighbor_alltoallv_, psb_comm_persistent_ineighbor_alltoallv_) - neighbor_a2av = .true. + case(psb_comm_ineighbor_alltoallv_) + ineighbor_a2av = .true. + case(psb_comm_persistent_ineighbor_alltoallv_) + ineighbor_a2av_persistent = .true. case default baseline = .true. end select @@ -1285,16 +1017,23 @@ contains call psb_errpush(info,name,a_err='baseline swap') goto 9999 end if - else if (neighbor_a2av) then + else if (ineighbor_a2av) then call psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_indexes, & & num_neighbors,total_send,total_recv,y%comm_handle,info) if (info /= psb_success_) then - call psb_errpush(info,name,a_err='neighbor a2av swap') + call psb_errpush(info,name,a_err='neighbor nonblocking swap') + goto 9999 + end if + else if (ineighbor_a2av_persistent) then + call psi_dswap_neighbor_topology_multivect_persistent(ctxt,swap_status,beta,y,comm_indexes, & + & num_neighbors,total_send,total_recv,y%comm_handle,info) + if (info /= psb_success_) then + call psb_errpush(info,name,a_err='neighbor persistent swap') goto 9999 end if else info = psb_err_mpi_error_ - call psb_errpush(info,name,a_err='Incompatible swap_status settings: neither baseline nor neighbor_a2av is true') + call psb_errpush(info,name,a_err='Incompatible swap_status settings: no valid communication mode selected') goto 9999 end if @@ -1330,7 +1069,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & ! locals integer(psb_mpk_) :: icomm - integer(psb_mpk_) :: np, me, nesd, nerv, n + integer(psb_mpk_) :: np, my_rank, nesd, nerv, n integer(psb_mpk_) :: proc_to_comm, p2ptag, p2pstat(mpi_status_size), iret integer(psb_mpk_), allocatable :: prcid(:) type(psb_comm_baseline_handle), pointer :: baseline_comm_handle @@ -1343,7 +1082,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & info = psb_success_ name = 'psi_dswap_baseline_multivect' call psb_erractionsave(err_act) - call psb_info(ctxt,me,np) + call psb_info(ctxt,my_rank,np) if (np == -1) then info = psb_err_context_error_ call psb_errpush(info,name) @@ -1370,7 +1109,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & call comm_indexes%sync() - if (debug) write(*,*) me,'Internal buffer' + if (debug) write(*,*) my_rank,'Internal buffer' if (do_send) then if (allocated(baseline_comm_handle%comid)) then if (any(baseline_comm_handle%comid /= mpi_request_null)) then @@ -1379,7 +1118,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & goto 9999 end if end if - if (debug) write(*,*) me,'do_send start' + if (debug) write(*,*) my_rank,'do_send start' call y%new_buffer(ione*size(comm_indexes%v),info) call psb_realloc(num_neighbors,2_psb_ipk_,baseline_comm_handle%comid,info) if (info /= psb_success_) then @@ -1397,8 +1136,8 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & nerv = comm_indexes%v(pnti+psb_n_elem_recv_) nesd = comm_indexes%v(pnti+nerv+psb_n_elem_send_) prcid(i) = psb_get_mpi_rank(ctxt,proc_to_comm) - if ((nerv>0).and.(proc_to_comm /= me)) then - if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt + if ((nerv>0).and.(proc_to_comm /= my_rank)) then + if (debug) write(*,*) my_rank,'Posting receive from',prcid(i),rcv_pt p2ptag = psb_double_swap_tag call mpi_irecv(y%combuf(rcv_pt),n*nerv,& & psb_mpi_r_dpk_,prcid(i),& @@ -1408,7 +1147,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & snd_pt = snd_pt + n*nesd pnti = pnti + nerv + nesd + 3 end do - if (debug) write(*,*) me,' Gather ' + if (debug) write(*,*) my_rank,' Gather ' ! ! Then gather for sending. ! @@ -1430,7 +1169,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & ! call y%device_wait() - if (debug) write(*,*) me,' isend' + if (debug) write(*,*) my_rank,' isend' ! ! Then send ! @@ -1444,7 +1183,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & nerv = comm_indexes%v(pnti+psb_n_elem_recv_) nesd = comm_indexes%v(pnti+nerv+psb_n_elem_send_) - if ((nesd>0).and.(proc_to_comm /= me)) then + if ((nesd>0).and.(proc_to_comm /= my_rank)) then call mpi_isend(y%combuf(snd_pt),n*nesd,& & psb_mpi_r_dpk_,prcid(i),& & p2ptag,icomm,baseline_comm_handle%comid(i,1),iret) @@ -1462,7 +1201,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & end if if (do_recv) then - if (debug) write(*,*) me,' do_Recv' + if (debug) write(*,*) my_rank,' do_Recv' if (.not.allocated(baseline_comm_handle%comid)) then ! ! No matching send? Something is wrong.... @@ -1473,7 +1212,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & end if call psb_realloc(num_neighbors,prcid,info) - if (debug) write(*,*) me,' wait' + if (debug) write(*,*) my_rank,' wait' pnti = 1 snd_pt = total_recv_+1 rcv_pt = 1 @@ -1482,7 +1221,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & proc_to_comm = comm_indexes%v(pnti+psb_proc_id_) nerv = comm_indexes%v(pnti+psb_n_elem_recv_) nesd = comm_indexes%v(pnti+nerv+psb_n_elem_send_) - if (proc_to_comm /= me)then + if (proc_to_comm /= my_rank)then if (nesd>0) then call mpi_wait(baseline_comm_handle%comid(i,1),p2pstat,iret) if(iret /= mpi_success) then @@ -1499,7 +1238,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & goto 9999 end if end if - else if (proc_to_comm == me) then + else if (proc_to_comm == my_rank) then if (nesd /= nerv) then write(psb_err_unit,*) & & 'Fatal error in swapdata: mismatch on self send',& @@ -1512,7 +1251,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & pnti = pnti + nerv + nesd + 3 end do - if (debug) write(*,*) me,' scatter' + if (debug) write(*,*) my_rank,' scatter' pnti = 1 snd_pt = total_recv_+1 rcv_pt = 1 @@ -1522,7 +1261,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & nesd = comm_indexes%v(pnti+nerv+psb_n_elem_send_) idx_pt = 1+pnti+psb_n_elem_recv_ - if (debug) write(0,*)me,' Received from: ',prcid(i),& + if (debug) write(0,*)my_rank,' Received from: ',prcid(i),& & y%combuf(rcv_pt:rcv_pt+n*nerv-1) call y%sct(idx_pt,rcv_pt,nerv,comm_indexes,beta) rcv_pt = rcv_pt + n*nerv @@ -1537,9 +1276,9 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & ! ! Then wait for device ! - if (debug) write(*,*) me,' wait' + if (debug) write(*,*) my_rank,' wait' call y%device_wait() - if (debug) write(*,*) me,' free buffer' + if (debug) write(*,*) my_rank,' free buffer' call y%free_buffer(info) if (info == 0) then if (allocated(y%comm_handle)) call psb_comm_free(y%comm_handle, info) @@ -1548,7 +1287,7 @@ subroutine psi_dswap_baseline_multivect(ctxt,swap_status,beta,y,comm_indexes, & call psb_errpush(psb_err_alloc_dealloc_,name) goto 9999 end if - if (debug) write(*,*) me,' done' + if (debug) write(*,*) my_rank,' done' end if @@ -1584,7 +1323,7 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in ! locals integer(psb_mpk_) :: icomm - integer(psb_mpk_) :: np, me + integer(psb_mpk_) :: np, my_rank integer(psb_mpk_) :: iret, p2pstat(mpi_status_size) type(psb_comm_neighbor_handle), pointer :: neighbor_comm_handle integer(psb_ipk_) :: err_act, topology_total_send, topology_total_recv, buffer_size @@ -1596,7 +1335,7 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in info = psb_success_ name = 'psi_dswap_neighbor_topology_multivect' call psb_erractionsave(err_act) - call psb_info(ctxt,me,np) + call psb_info(ctxt,my_rank,np) if (np == -1) then info=psb_err_context_error_ call psb_errpush(info,name) @@ -1624,16 +1363,200 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in ! START phase: build topology (if needed), gather, post MPI ! --------------------------------------------------------- if (do_start) then - if(debug) write(*,*) me,' nbr_vect: starting data exchange' - if (neighbor_comm_handle%use_persistent_buffers) then - if (neighbor_comm_handle%persistent_in_flight) then + if(debug) write(*,*) my_rank,' nbr_vect: starting data exchange (nonblocking)' + if (.not. neighbor_comm_handle%is_initialized) then + if (debug) write(*,*) my_rank,' nbr_vect: building topology via handle' + call neighbor_comm_handle%topology_init(comm_indexes%v, num_neighbors, total_send, total_recv, & + & ctxt, icomm, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_internal_error_, name, a_err='neighbor_topology_init') + goto 9999 + end if + end if + topology_total_send = neighbor_comm_handle%total_send + topology_total_recv = neighbor_comm_handle%total_recv + + ! Buffer layout: + ! combuf(1 : total_send) = send area + ! combuf(total_send+1 : total_send+total_recv) = recv area + buffer_size = topology_total_send + topology_total_recv + + if (buffer_size > 0) then + call y%new_buffer(buffer_size, info) + if (info /= 0) then + call psb_errpush(psb_err_alloc_dealloc_, name) + goto 9999 + end if + end if + neighbor_comm_handle%comm_request = mpi_request_null + + ! Gather send data into contiguous send buffer (polymorphic for GPU) + if (buffer_size > 0) then + if (debug) write(*,*) my_rank,' nbr_vect: gathering send data,', topology_total_send,' elems' + call y%gth(int(topology_total_send,psb_mpk_), & + & neighbor_comm_handle%send_indexes, & + & y%combuf(1:topology_total_send)) + end if + + ! Wait for device (important for GPU subclasses) + call y%device_wait() + + ! Post non-blocking neighborhood alltoallv + if (debug) write(*,*) my_rank,' nbr_vect: posting MPI_Ineighbor_alltoallv' + if (buffer_size > 0) then + call mpi_ineighbor_alltoallv( & + & y%combuf(1), & ! send buffer + & neighbor_comm_handle%send_counts, & + & neighbor_comm_handle%send_displs, & + & psb_mpi_r_dpk_, & + & y%combuf(topology_total_send + 1), & ! recv buffer + & neighbor_comm_handle%recv_counts, & + & neighbor_comm_handle%recv_displs, & + & psb_mpi_r_dpk_, & + & neighbor_comm_handle%graph_comm, & + & neighbor_comm_handle%comm_request, iret) + if (iret /= mpi_success) then info = psb_err_mpi_error_ - call psb_errpush(info, name, a_err='Invalid START: persistent neighbor request already in flight') + call psb_errpush(info, name, m_err=(/iret/)) + goto 9999 + end if + else + neighbor_comm_handle%comm_request = mpi_request_null + end if + + end if ! do_start + + ! --------------------------------------------------------- + ! WAIT phase: complete MPI, scatter received data + ! --------------------------------------------------------- + if (do_wait) then + + if ((topology_total_send + topology_total_recv) > 0) then + if (neighbor_comm_handle%comm_request == mpi_request_null) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, m_err=(/-2/)) + goto 9999 + end if + else + neighbor_comm_handle%comm_request = mpi_request_null + end if + + topology_total_send = neighbor_comm_handle%total_send + topology_total_recv = neighbor_comm_handle%total_recv + + ! Wait for the non-blocking collective to complete + if ((topology_total_send + topology_total_recv) > 0) then + if (debug) write(*,*) my_rank,' nbr_vect: waiting on MPI request' + call mpi_wait(neighbor_comm_handle%comm_request, p2pstat, iret) + if (iret /= mpi_success) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, m_err=(/iret/)) goto 9999 end if end if + + ! Scatter received data to local vector positions (polymorphic for GPU) + if ((topology_total_send + topology_total_recv) > 0) then + if (debug) write(*,*) my_rank,' nbr_vect: scattering recv data,', topology_total_recv,' elems' + call y%sct(int(topology_total_recv,psb_mpk_), & + & neighbor_comm_handle%recv_indexes, & + & y%combuf(topology_total_send+1:topology_total_send+topology_total_recv), & + & beta) + end if + + + ! Clean up + neighbor_comm_handle%comm_request = mpi_request_null + call y%device_wait() + call y%maybe_free_buffer(info) + if (info /= 0) then + call psb_errpush(psb_err_alloc_dealloc_, name) + goto 9999 + end if + if (debug) write(*,*) my_rank,' nbr_vect: done' + + end if ! do_wait + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(ctxt,err_act) + + return +end subroutine psi_dswap_neighbor_topology_multivect + + + +subroutine psi_dswap_neighbor_topology_multivect_persistent(ctxt,swap_status,beta,y,comm_indexes,& + & num_neighbors,total_send,total_recv,comm_handle,info) +#ifdef PSB_MPI_MOD + use mpi +#endif + implicit none +#ifdef PSB_MPI_H + include 'mpif.h' +#endif + + type(psb_ctxt_type), intent(in) :: ctxt + integer(psb_ipk_), intent(in) :: swap_status + real(psb_dpk_), intent(in) :: beta + class(psb_d_base_multivect_type), intent(inout) :: y + class(psb_i_base_vect_type), intent(inout) :: comm_indexes + integer(psb_ipk_), intent(in) :: num_neighbors,total_send, total_recv + class(psb_comm_handle_type), intent(inout) :: comm_handle + integer(psb_ipk_), intent(out) :: info + + + ! locals + integer(psb_mpk_) :: icomm + integer(psb_mpk_) :: np, my_rank + integer(psb_mpk_) :: iret, p2pstat(mpi_status_size) + type(psb_comm_neighbor_handle), pointer :: neighbor_comm_handle + integer(psb_ipk_) :: err_act, topology_total_send, topology_total_recv, buffer_size + logical :: do_start, do_wait + logical, parameter :: debug = .false. + character(len=30) :: name + + + info = psb_success_ + name = 'psi_dswap_neighbor_topology_multivect_persistent' + call psb_erractionsave(err_act) + call psb_info(ctxt,my_rank,np) + if (np == -1) then + info=psb_err_context_error_ + call psb_errpush(info,name) + goto 9999 + endif + + icomm = ctxt%get_mpic() + + neighbor_comm_handle => null() + select type(ch => comm_handle) + type is(psb_comm_neighbor_handle) + neighbor_comm_handle => ch + class default + info = psb_err_mpi_error_ + call psb_errpush(info,name,a_err='Expected neighbor comm_handle in persistent neighbor multivect swap') + goto 9999 + end select + + do_start = (swap_status == psb_comm_status_start_) .or. (swap_status == psb_comm_status_unknown_) + do_wait = (swap_status == psb_comm_status_wait_) .or. (swap_status == psb_comm_status_unknown_) + + call comm_indexes%sync() + + ! --------------------------------------------------------- + ! START phase: build topology (if needed), gather, post MPI + ! --------------------------------------------------------- + if (do_start) then + if(debug) write(*,*) my_rank,' nbr_vect: starting data exchange (persistent)' + if (neighbor_comm_handle%persistent_in_flight) then + info = psb_err_mpi_error_ + call psb_errpush(info, name, a_err='Invalid START: persistent neighbor request already in flight') + goto 9999 + end if if (.not. neighbor_comm_handle%is_initialized) then - if (debug) write(*,*) me,' nbr_vect: building topology via handle' + if (debug) write(*,*) my_rank,' nbr_vect: building topology via handle' call neighbor_comm_handle%topology_init(comm_indexes%v, num_neighbors, total_send, total_recv, & & ctxt, icomm, info) if (info /= psb_success_) then @@ -1649,9 +1572,8 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in ! combuf(total_send+1 : total_send+total_recv) = recv area buffer_size = topology_total_send + topology_total_recv - if (neighbor_comm_handle%use_persistent_buffers) then + if (buffer_size > 0) then if (.not. allocated(y%combuf)) then - neighbor_comm_handle%diag_buffer_reallocs = neighbor_comm_handle%diag_buffer_reallocs + 1 if (neighbor_comm_handle%persistent_request_ready) then if (neighbor_comm_handle%persistent_request /= mpi_request_null) then call mpi_request_free(neighbor_comm_handle%persistent_request, iret) @@ -1667,7 +1589,6 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in goto 9999 end if else if (size(y%combuf) < buffer_size) then - neighbor_comm_handle%diag_buffer_reallocs = neighbor_comm_handle%diag_buffer_reallocs + 1 if (neighbor_comm_handle%persistent_request_ready) then if (neighbor_comm_handle%persistent_request /= mpi_request_null) then call mpi_request_free(neighbor_comm_handle%persistent_request, iret) @@ -1683,28 +1604,25 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in goto 9999 end if end if - else - call y%new_buffer(buffer_size, info) - if (info /= 0) then - call psb_errpush(psb_err_alloc_dealloc_, name) - goto 9999 - end if end if neighbor_comm_handle%comm_request = mpi_request_null - ! Gather send data into contiguous send buffer (polymorphic for GPU) - if (debug) write(*,*) me,' nbr_vect: gathering send data,', topology_total_send,' elems' - call y%gth(int(topology_total_send,psb_mpk_), & - & neighbor_comm_handle%send_indexes, & - & y%combuf(1:topology_total_send)) + if (buffer_size > 0) then + ! Gather send data into contiguous send buffer (polymorphic for GPU) + if (debug) write(*,*) my_rank,' nbr_vect: gathering send data,', topology_total_send,' elems' + call y%gth(int(topology_total_send,psb_mpk_), & + & neighbor_comm_handle%send_indexes, & + & y%combuf(1:topology_total_send)) + else + neighbor_comm_handle%persistent_in_flight = .false. + end if ! Wait for device (important for GPU subclasses) call y%device_wait() - if (neighbor_comm_handle%use_persistent_buffers) then - if (.not. neighbor_comm_handle%persistent_request_ready) then -#ifdef PSB_HAVE_MPI_NEIGHBOR_PERSISTENT - if (debug) write(*,*) me,' nbr_vect: posting MPI_Neighbor_alltoallv_init' + if (.not. neighbor_comm_handle%persistent_request_ready) then + if (buffer_size > 0) then + if (debug) write(*,*) my_rank,' nbr_vect: posting MPI_Neighbor_alltoallv_init' call mpi_neighbor_alltoallv_init( & & y%combuf(1), & ! send buffer & neighbor_comm_handle%send_counts, & @@ -1722,80 +1640,24 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in call psb_errpush(info, name, m_err=(/iret/)) goto 9999 end if - neighbor_comm_handle%diag_init_calls = neighbor_comm_handle%diag_init_calls + 1 neighbor_comm_handle%persistent_request_ready = .true. neighbor_comm_handle%persistent_buffer_size = buffer_size -#else - ! Fallback when persistent neighborhood collectives are not available + else neighbor_comm_handle%persistent_request_ready = .false. neighbor_comm_handle%persistent_buffer_size = 0 -#endif end if + end if -#ifdef PSB_HAVE_MPI_NEIGHBOR_PERSISTENT - ! Count the attempt before MPI_Start so we can diagnose call reachability. - neighbor_comm_handle%diag_start_calls = neighbor_comm_handle%diag_start_calls + 1 - if (psb_swap_start_debug_should_report()) then - write(psb_out_unit,'("SWAP_DEBUG MPI_Start(pre) kind=multivect rank=",i0,", bsz=",i0,", ready=",l1)') & - & me, buffer_size, neighbor_comm_handle%persistent_request_ready - write(psb_out_unit,'(" inflight=",l1,", req_null=",l1,", dstart=",i0)') & - & neighbor_comm_handle%persistent_in_flight, & - & (neighbor_comm_handle%persistent_request == mpi_request_null), & - & neighbor_comm_handle%diag_start_calls - end if + if (buffer_size > 0) then call mpi_start(neighbor_comm_handle%persistent_request, iret) - if (psb_swap_start_debug_should_report()) then - write(psb_out_unit,'("SWAP_DEBUG MPI_Start(post) kind=multivect rank=",i0,", iret=",i0)') & - & me, iret - write(psb_out_unit,'(" inflight=",l1,", dstart=",i0)') & - & neighbor_comm_handle%persistent_in_flight, neighbor_comm_handle%diag_start_calls - end if - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/iret/)) - goto 9999 - end if - neighbor_comm_handle%persistent_in_flight = .true. -#else - neighbor_comm_handle%diag_ineighbor_calls = neighbor_comm_handle%diag_ineighbor_calls + 1 - call mpi_ineighbor_alltoallv( & - & y%combuf(1), & ! send buffer - & neighbor_comm_handle%send_counts, & - & neighbor_comm_handle%send_displs, & - & psb_mpi_r_dpk_, & - & y%combuf(topology_total_send + 1), & ! recv buffer - & neighbor_comm_handle%recv_counts, & - & neighbor_comm_handle%recv_displs, & - & psb_mpi_r_dpk_, & - & neighbor_comm_handle%graph_comm, & - & neighbor_comm_handle%comm_request, iret) if (iret /= mpi_success) then info = psb_err_mpi_error_ call psb_errpush(info, name, m_err=(/iret/)) goto 9999 end if neighbor_comm_handle%persistent_in_flight = .true. -#endif else - ! Post non-blocking neighborhood alltoallv - if (debug) write(*,*) me,' nbr_vect: posting MPI_Ineighbor_alltoallv' - neighbor_comm_handle%diag_ineighbor_calls = neighbor_comm_handle%diag_ineighbor_calls + 1 - call mpi_ineighbor_alltoallv( & - & y%combuf(1), & ! send buffer - & neighbor_comm_handle%send_counts, & - & neighbor_comm_handle%send_displs, & - & psb_mpi_r_dpk_, & - & y%combuf(topology_total_send + 1), & ! recv buffer - & neighbor_comm_handle%recv_counts, & - & neighbor_comm_handle%recv_displs, & - & psb_mpi_r_dpk_, & - & neighbor_comm_handle%graph_comm, & - & neighbor_comm_handle%comm_request, iret) - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/iret/)) - goto 9999 - end if + neighbor_comm_handle%persistent_in_flight = .false. end if end if ! do_start @@ -1805,68 +1667,38 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in ! --------------------------------------------------------- if (do_wait) then - if (neighbor_comm_handle%use_persistent_buffers) then + topology_total_send = neighbor_comm_handle%total_send + topology_total_recv = neighbor_comm_handle%total_recv + + if ((topology_total_send + topology_total_recv) > 0) then if (.not. neighbor_comm_handle%persistent_in_flight) then info = psb_err_mpi_error_ call psb_errpush(info, name, a_err='Invalid WAIT: no persistent neighbor request in flight') goto 9999 end if - else - if (neighbor_comm_handle%comm_request == mpi_request_null) then + + ! Wait for the persistent collective to complete + if (debug) write(*,*) my_rank,' nbr_vect: waiting on persistent MPI request' + call mpi_wait(neighbor_comm_handle%persistent_request, p2pstat, iret) + if (iret /= mpi_success) then info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/-2/)) + call psb_errpush(info, name, m_err=(/iret/)) goto 9999 end if - end if - - topology_total_send = neighbor_comm_handle%total_send - topology_total_recv = neighbor_comm_handle%total_recv + neighbor_comm_handle%persistent_in_flight = .false. - ! Wait for the non-blocking collective to complete - if (debug) write(*,*) me,' nbr_vect: waiting on MPI request' - if (neighbor_comm_handle%use_persistent_buffers) then -#ifdef PSB_HAVE_MPI_NEIGHBOR_PERSISTENT - call mpi_wait(neighbor_comm_handle%persistent_request, p2pstat, iret) -#else - call mpi_wait(neighbor_comm_handle%comm_request, p2pstat, iret) -#endif + ! Scatter received data to local vector positions (polymorphic for GPU) + if (debug) write(*,*) my_rank,' nbr_vect: scattering recv data,', topology_total_recv,' elems' + call y%sct(int(topology_total_recv,psb_mpk_), & + & neighbor_comm_handle%recv_indexes, & + & y%combuf(topology_total_send+1:topology_total_send+topology_total_recv), & + & beta) else - call mpi_wait(neighbor_comm_handle%comm_request, p2pstat, iret) - end if - if (iret /= mpi_success) then - info = psb_err_mpi_error_ - call psb_errpush(info, name, m_err=(/iret/)) - goto 9999 - end if - if (neighbor_comm_handle%use_persistent_buffers) then - neighbor_comm_handle%diag_wait_calls = neighbor_comm_handle%diag_wait_calls + 1 - end if - if (neighbor_comm_handle%use_persistent_buffers) then neighbor_comm_handle%persistent_in_flight = .false. end if - ! Scatter received data to local vector positions (polymorphic for GPU) - if (debug) write(*,*) me,' nbr_vect: scattering recv data,', topology_total_recv,' elems' - call y%sct(int(topology_total_recv,psb_mpk_), & - & neighbor_comm_handle%recv_indexes, & - & y%combuf(topology_total_send+1:topology_total_send+topology_total_recv), & - & beta) - - - ! Clean up - if ((.not. neighbor_comm_handle%use_persistent_buffers) .or. & - & (neighbor_comm_handle%use_persistent_buffers .and. .not. neighbor_comm_handle%persistent_request_ready)) then - neighbor_comm_handle%comm_request = mpi_request_null - end if call y%device_wait() - if (.not. neighbor_comm_handle%use_persistent_buffers) then - call y%maybe_free_buffer(info) - if (info /= 0) then - call psb_errpush(psb_err_alloc_dealloc_, name) - goto 9999 - end if - end if - if (debug) write(*,*) me,' nbr_vect: done' + if (debug) write(*,*) my_rank,' nbr_vect: done' end if ! do_wait @@ -1876,7 +1708,7 @@ subroutine psi_dswap_neighbor_topology_multivect(ctxt,swap_status,beta,y,comm_in 9999 call psb_error_handler(ctxt,err_act) return -end subroutine psi_dswap_neighbor_topology_multivect +end subroutine psi_dswap_neighbor_topology_multivect_persistent end submodule psi_d_swapdata_impl diff --git a/base/modules/comm/comm_schemes/psb_comm_neighbor_impl_mod.F90 b/base/modules/comm/comm_schemes/psb_comm_neighbor_impl_mod.F90 index 4f8a29b8..742dc89c 100644 --- a/base/modules/comm/comm_schemes/psb_comm_neighbor_impl_mod.F90 +++ b/base/modules/comm/comm_schemes/psb_comm_neighbor_impl_mod.F90 @@ -30,11 +30,6 @@ module psb_comm_neighbor_impl_mod logical :: persistent_request_ready = .false. logical :: persistent_in_flight = .false. integer(psb_ipk_) :: persistent_buffer_size = 0 - integer(psb_ipk_) :: diag_init_calls = 0 - integer(psb_ipk_) :: diag_start_calls = 0 - integer(psb_ipk_) :: diag_ineighbor_calls = 0 - integer(psb_ipk_) :: diag_wait_calls = 0 - integer(psb_ipk_) :: diag_buffer_reallocs = 0 contains procedure, pass :: init => psb_comm_neighbor_init procedure, pass :: free => neighbor_topology_free @@ -398,11 +393,6 @@ contains this%persistent_request_ready = .false. this%persistent_in_flight = .false. this%persistent_buffer_size = 0 - this%diag_init_calls = 0 - this%diag_start_calls = 0 - this%diag_ineighbor_calls = 0 - this%diag_wait_calls = 0 - this%diag_buffer_reallocs = 0 call this%free(info) end subroutine psb_comm_neighbor_destroy @@ -440,16 +430,6 @@ contains this%persistent_request_ready = .false. this%persistent_in_flight = .false. this%persistent_buffer_size = 0 - this%diag_init_calls = 0 - this%diag_start_calls = 0 - this%diag_ineighbor_calls = 0 - this%diag_wait_calls = 0 - this%diag_buffer_reallocs = 0 - this%diag_init_calls = 0 - this%diag_start_calls = 0 - this%diag_ineighbor_calls = 0 - this%diag_wait_calls = 0 - this%diag_buffer_reallocs = 0 end subroutine psb_comm_neighbor_init diff --git a/test/comm/cg/Makefile b/test/comm/cg/Makefile index b1e4e53d..6bc3ee30 100644 --- a/test/comm/cg/Makefile +++ b/test/comm/cg/Makefile @@ -4,8 +4,8 @@ MODDIR=$(INSTALLDIR)/modules/ include $(INCDIR)/Make.inc.psblas LIBDIR=$(INSTALLDIR)/lib/ -PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base -LDLIBS=$(PSBLDLIBS) +PSBLAS_LIB= -L$(LIBDIR) $(LCUDA) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base -lpsb_cuda -lpsb_ext +LDLIBS=$(PSBGPULDLIBS) FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG). @@ -24,7 +24,7 @@ runsd: (if test ! -d runs ; then mkdir runs; fi) psb_comm_cg_test.o: $(PROGSRC) - $(FC) $(FCOPT) $(FINCLUDES) $(FDEFINES) -c $(PROGSRC) -o $@ + $(FC) $(FCOPT) $(FINCLUDES) $(FDEFINES) $(FCUDEFINES) -c $(PROGSRC) -o $@ $(EXE): $(TOBJS) $(FLINK) $(LOPT) $(TOBJS) -o $(EXE) $(PSBLAS_LIB) $(LDLIBS) diff --git a/test/comm/cg/psb_comm_cg_test.F90 b/test/comm/cg/psb_comm_cg_test.F90 index 1b75dad7..baf1bab6 100644 --- a/test/comm/cg/psb_comm_cg_test.F90 +++ b/test/comm/cg/psb_comm_cg_test.F90 @@ -1,5 +1,8 @@ program psb_comm_cg_test use psb_base_mod +#ifdef PSB_HAVE_CUDA + use psb_cuda_mod +#endif use psb_prec_mod use psb_linsolve_mod use psb_comm_factory_mod @@ -14,8 +17,14 @@ program psb_comm_cg_test type(psb_desc_type) :: desc_a type(psb_d_vect_type) :: b, x type(psb_dprec_type) :: prec - - integer(psb_ipk_) :: info, iam, np +#ifdef PSB_HAVE_CUDA + type(psb_d_vect_cuda) :: vmold + type(psb_i_vect_cuda) :: imold + type(psb_d_cuda_hlg_sparse_mat), target :: ahlg + class(psb_d_base_sparse_mat), pointer :: agmold +#endif + + integer(psb_ipk_) :: info, my_rank, np integer(psb_ipk_) :: desc_me, desc_np integer(psb_ipk_) :: idim, itmax, itrace, istop, iter integer(psb_ipk_) :: scheme_idx, prec_idx, rep, nrep, nwarm @@ -35,7 +44,9 @@ program psb_comm_cg_test character(len=20) :: prec_name(n_precs) character(len=5) :: afmt character(len=256) :: arg + character(len=16) :: gpu_arg logical :: setup_done + logical :: use_gpu info = psb_success_ afmt = 'CSR' @@ -47,6 +58,11 @@ program psb_comm_cg_test itrace = -1 istop = 2 eps = 1.d-6 +#ifdef PSB_HAVE_CUDA + use_gpu = .true. +#else + use_gpu = .false. +#endif scheme_type = (/ psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_, & & psb_comm_persistent_ineighbor_alltoallv_ /) scheme_name(1) = 'isend_irecv' @@ -89,15 +105,31 @@ program psb_comm_cg_test info = psb_success_ end if end if + + call parse_gpu_arg(use_gpu, info) + if (info /= psb_success_) then + write(psb_err_unit,'("Invalid value for --gpu option. Use --gpu=TRUE or --gpu=FALSE")') + stop 1 + end if ! call psb_set_debug_level(psb_debug_ext_) ! call probe_ieee('before psb_init') call psb_init(ctxt) +#ifdef PSB_HAVE_CUDA + if (use_gpu) call psb_cuda_init(ctxt) +#endif ! call probe_ieee('after psb_init') call clear_ieee_flags() ! call probe_ieee('after clear_ieee_flags') - call psb_info(ctxt, iam, np) + call psb_info(ctxt, my_rank, np) + +#ifndef PSB_HAVE_CUDA + if (use_gpu .and. my_rank == psb_root_) then + write(psb_out_unit,'("Warning: --gpu=TRUE requested but this executable was built without CUDA support. Running on CPU.")') + end if + use_gpu = .false. +#endif allocate(setup_time(n_precs,n_schemes,nrep), solve_time(n_precs,n_schemes,nrep), & & total_time(n_precs,n_schemes,nrep), final_error(n_precs,n_schemes,nrep), & @@ -107,7 +139,7 @@ program psb_comm_cg_test & comm_set_time(n_precs,n_schemes,nrep), krylov_time(n_precs,n_schemes,nrep), stat=info) if (info /= psb_success_) stop 1 - if (iam == psb_root_) then + if (my_rank == psb_root_) then write(psb_out_unit,*) 'Welcome to PSBLAS version: ', psb_version_string_ write(psb_out_unit,*) 'This is the comm/cg test program' write(psb_out_unit,'("Grid dimensions : ",i4," x ",i4," x ",i4)') idim,idim,idim @@ -117,18 +149,32 @@ program psb_comm_cg_test write(psb_out_unit,'("Max iterations (CG) : ",i0)') itmax write(psb_out_unit,'("Repetitions : ",i0)') nrep write(psb_out_unit,'("Warmup solves : ",i0)') nwarm + write(psb_out_unit,'("GPU enabled : ",l1)') use_gpu write(psb_out_unit,'(" ")') - write(psb_out_unit,'("Usage: ./psb_comm_cg_test [idim] [nrep] [nwarm] [itmax]")') + write(psb_out_unit,'("Usage: ./psb_comm_cg_test [idim] [nrep] [nwarm] [itmax] [--gpu=TRUE|FALSE]")') write(psb_out_unit,'(" ")') end if call psb_barrier(ctxt) - t_start = psb_wtime() ! call probe_ieee('before psb_d_gen_pde3d') call psb_d_gen_pde3d(ctxt,idim,a,b,x,desc_a,afmt,info) ! call probe_ieee('after psb_d_gen_pde3d') if (info /= psb_success_) goto 9999 +#ifdef PSB_HAVE_CUDA + if (use_gpu) then + agmold => ahlg + call a%cscnv(info,mold=agmold) + if (info /= psb_success_) goto 9999 + call desc_a%cnv(mold=imold) + if (info /= psb_success_) goto 9999 + call psb_geasb(x,desc_a,info,mold=vmold) + if (info /= psb_success_) goto 9999 + call psb_geasb(b,desc_a,info,mold=vmold) + if (info /= psb_success_) goto 9999 + end if +#endif + do prec_idx = 1, n_precs do scheme_idx = 1, n_schemes do rep = 1, nrep @@ -206,31 +252,20 @@ program psb_comm_cg_test final_error(prec_idx,scheme_idx,rep) = err solve_info(prec_idx,scheme_idx,rep) = info - if (iam == psb_root_) then - select type(ch => x%v%comm_handle) - type is(psb_comm_neighbor_handle) - write(psb_out_unit,'("DIAG_COMM scheme=",a,", prec=",a,", rep=",i0)') & - & trim(scheme_name(scheme_idx)), trim(prec_name(prec_idx)), rep - write(psb_out_unit,'("DIAG_COMM counters: init=",i0,", start=",i0,", ineighbor=",i0,", wait=",i0,", realloc=",i0)') & - & ch%diag_init_calls, ch%diag_start_calls, ch%diag_ineighbor_calls, & - & ch%diag_wait_calls, ch%diag_buffer_reallocs - write(psb_out_unit,'("DIAG_COMM state: ready=",l1,", bsz=",i0)') & - & ch%persistent_request_ready, ch%persistent_buffer_size - class default - continue - end select - end if - if (info /= psb_success_) goto 9999 end do end do end do - if (iam == psb_root_) then + if (my_rank == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'(100("="))') write(psb_out_unit,'("CG TIMING STATISTICS - FINAL RESULTS TABLE")') write(psb_out_unit,'(100("="))') + write(psb_out_unit,'("Legend:")') + write(psb_out_unit,'(" - Each phase time for one repetition is reduced with max across MPI ranks.")') + write(psb_out_unit,'(" - Minimum/Average/Maximum/Std Dev are computed across repetitions.")') + write(psb_out_unit,'(" - KrylovPerIter = KrylovSolve/iterations; TotalPerIter = TotalTime/iterations.")') do prec_idx = 1, n_precs write(psb_out_unit,'(" ")') @@ -307,6 +342,10 @@ program psb_comm_cg_test & prec_init_time,prec_bld_time,comm_set_time,krylov_time, & & krylov_it_time,total_it_time) + +#ifdef PSB_HAVE_CUDA + if (use_gpu) call psb_cuda_exit() +#endif call psb_exit(ctxt) stop @@ -449,6 +488,32 @@ contains call ieee_set_flag(ieee_underflow, .false.) end subroutine clear_ieee_flags + subroutine parse_gpu_arg(use_gpu, info) + logical, intent(inout) :: use_gpu + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i, argc + character(len=256) :: carg, uarg, val + + info = psb_success_ + argc = command_argument_count() + do i = 1, argc + call get_command_argument(i,carg) + uarg = psb_toupper(trim(carg)) + if (index(uarg,'--GPU=') == 1) then + val = psb_toupper(adjustl(carg(7:len_trim(carg)))) + select case (trim(val)) + case ('TRUE','T','1','YES','Y','ON') + use_gpu = .true. + case ('FALSE','F','0','NO','N','OFF') + use_gpu = .false. + case default + info = psb_err_internal_error_ + return + end select + end if + end do + end subroutine parse_gpu_arg + subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info) implicit none integer(psb_ipk_), intent(in) :: idim @@ -464,7 +529,7 @@ contains integer(psb_lpk_) :: m,n,glob_row integer(psb_ipk_) :: nnz,nlr,i,ii,ib,k integer(psb_ipk_) :: ix,iy,iz - integer(psb_ipk_) :: np, iam, nr, nt + integer(psb_ipk_) :: np, my_rank, nr, nt integer(psb_ipk_) :: icoeff integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) real(psb_dpk_), allocatable :: val(:) @@ -477,7 +542,7 @@ contains name = 'create_matrix' call psb_erractionsave(err_act) - call psb_info(ctxt, iam, np) + call psb_info(ctxt, my_rank, np) if (idim <= 0) then info = psb_err_internal_error_ @@ -489,9 +554,9 @@ contains call psb_errpush(info,name,a_err='invalid context: np <= 0') goto 9999 end if - if (iam < 0) then + if (my_rank < 0) then info = psb_err_context_error_ - call psb_errpush(info,name,a_err='invalid context: iam < 0') + call psb_errpush(info,name,a_err='invalid context: my_rank < 0') goto 9999 end if @@ -528,14 +593,14 @@ contains call psb_errpush(info,name,a_err='invalid local nnz estimate: nnz <= 0') goto 9999 end if - if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n + if(my_rank == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n nt = (m+np-1)/np - nr = max(0,min(nt,m-(iam*nt))) + nr = max(0,min(nt,m-(my_rank*nt))) nt = nr call psb_sum(ctxt,nt) if (nt /= m) then - write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m + write(psb_err_unit,*) my_rank, 'Initialization error ',nr,nt,m info = -1 call psb_barrier(ctxt) call psb_abort(ctxt) @@ -661,7 +726,7 @@ contains ! call probe_ieee('after psb_spins') if(info /= psb_success_) then write(psb_err_unit,'("INSERT FAIL rank=",i0,", call=psb_spins, ii=",i0,", ib=",i0,", icoeff=",i0)') & - iam, ii, ib, icoeff + my_rank, ii, ib, icoeff write(psb_err_unit,'(" glob_row=",i0,", ix=",i0,", iy=",i0,", iz=",i0)') glob_row, ix, iy, iz exit end if @@ -669,7 +734,7 @@ contains ! call probe_ieee('after psb_geins bv') if(info /= psb_success_) then write(psb_err_unit,'("INSERT FAIL rank=",i0,", call=psb_geins bv, ii=",i0,", ib=",i0,", icoeff=",i0)') & - iam, ii, ib, icoeff + my_rank, ii, ib, icoeff write(psb_err_unit,'(" glob_row=",i0,", ix=",i0,", iy=",i0,", iz=",i0)') glob_row, ix, iy, iz exit end if @@ -678,7 +743,7 @@ contains ! call probe_ieee('after psb_geins xv') if(info /= psb_success_) then write(psb_err_unit,'("INSERT FAIL rank=",i0,", call=psb_geins xv, ii=",i0,", ib=",i0,", icoeff=",i0)') & - iam, ii, ib, icoeff + my_rank, ii, ib, icoeff write(psb_err_unit,'(" glob_row=",i0,", ix=",i0,", iy=",i0,", iz=",i0)') glob_row, ix, iy, iz exit end if @@ -730,7 +795,7 @@ contains call psb_amx(ctxt,tgen) call psb_amx(ctxt,tasb) call psb_amx(ctxt,ttot) - if(iam == psb_root_) then + if(my_rank == psb_root_) then tmpfmt = a%get_fmt() write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")') tmpfmt write(psb_out_unit,'("-allocation time : ",es12.5)') talc diff --git a/test/comm/cg/psb_pde3d_cg_noprec.inp b/test/comm/cg/psb_pde3d_cg_noprec.inp deleted file mode 100644 index fbfeeaf1..00000000 --- a/test/comm/cg/psb_pde3d_cg_noprec.inp +++ /dev/null @@ -1,18 +0,0 @@ -17 Number of entries below this -CG Iterative method BICGSTAB CGS BICG BICGSTABL RGMRES FCG CGR RICHARDSON -NONE Preconditioner NONE DIAG BJAC -CSR Storage format for matrix A: CSR COO -100 Domain size (actual system is this**3 in pde3d) -3 Partition: 1 BLOCK 3 3D -2 Stopping criterion 1 2 -0200 MAXIT -10 ITRACE -002 IRST restart for RGMRES and BiCGSTABL -INVK Block Solver ILU,ILUT,INVK,INVT,AINV -NONE If ILU : MILU or NONE otherwise ignored -NONE Scaling if ILUT: NONE, MAXVAL otherwise ignored -0 Level of fill for forward factorization -1 Level of fill for inverse factorization (only INVK,INVT) -1E-1 Threshold for forward factorization -1E-1 Threshold for inverse factorization (Only INVK, INVT) -LLK Orthogonalization algorithm (only AINV) diff --git a/test/comm/spmv/Makefile b/test/comm/spmv/Makefile index 1416389d..dd1df48b 100644 --- a/test/comm/spmv/Makefile +++ b/test/comm/spmv/Makefile @@ -6,12 +6,12 @@ include $(INCDIR)/Make.inc.psblas # Libraries used # LIBDIR=$(INSTALLDIR)/lib/ -PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base -LDLIBS=$(PSBLDLIBS) +PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_cuda -lpsb_ext -lpsb_base +LDLIBS=$(PSBGPULDLIBS) FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG). -TOBJS=psb_spmv_overlap_test.o spmv_overlap.o +TOBJS=psb_spmv_test.o EXEDIR=./runs @@ -20,13 +20,16 @@ all: runsd spmv_overlap runsd: (if test ! -d runs ; then mkdir runs; fi) +psb_spmv_test.o: psb_spmv_test.f90 + $(FC) $(FCOPT) -cpp $(FINCLUDES) $(FDEFINES) $(FCUDEFINES) -c $< -o $@ + spmv_overlap: $(TOBJS) $(FLINK) $(LOPT) $(TOBJS) -o spmv_overlap $(PSBLAS_LIB) $(LDLIBS) /bin/mv spmv_overlap $(EXEDIR) - /bin/cp -f $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_overlap_test + /bin/cp -f $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel clean: - /bin/rm -f $(TOBJS) $(TOBJS_API) *$(.mod) $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_overlap_test + /bin/rm -f $(TOBJS) $(TOBJS_API) *$(.mod) $(EXEDIR)/spmv_overlap $(EXEDIR)/psb_spmv_kernel lib: (cd ../../; make library) diff --git a/test/comm/spmv/psb_spmv_overlap_test.f90 b/test/comm/spmv/psb_spmv_overlap_test.f90 deleted file mode 100644 index 25226fcf..00000000 --- a/test/comm/spmv/psb_spmv_overlap_test.f90 +++ /dev/null @@ -1,1077 +0,0 @@ -!> Test program for overlapping communication and computation with psb_spmm. -!! -!! This benchmark compares two equivalent SpMV paths: -!! 1. Serialized halo exchange + compute -!! 2. Overlapped psb_spmm(..., doswap=.true.) -!! -module psb_spmv_overlap_test - - use psb_base_mod - use psb_util_mod - use psb_comm_factory_mod, only: psb_comm_set - use psb_comm_schemes_mod, only: psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_, & - & psb_comm_persistent_ineighbor_alltoallv_ - use psb_comm_neighbor_impl_mod, only: psb_comm_neighbor_handle - - implicit none - - interface - function d_func_3d(x,y,z) result(val) - import :: psb_dpk_ - real(psb_dpk_), intent(in) :: x,y,z - real(psb_dpk_) :: val - end function d_func_3d - end interface - -contains - - - function d_null_func_3d(x,y,z) result(val) - - real(psb_dpk_), intent(in) :: x,y,z - real(psb_dpk_) :: val - - val = dzero - - end function d_null_func_3d - ! - ! functions parametrizing the differential equation - ! - - ! - ! Note: b1, b2 and b3 are the coefficients of the first - ! derivative of the unknown function. The default - ! we apply here is to have them zero, so that the resulting - ! matrix is symmetric/hermitian and suitable for - ! testing with CG and FCG. - ! When testing methods for non-hermitian matrices you can - ! change the B1/B2/B3 functions to e.g. done/sqrt((3*done)) - ! - function b1(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: b1 - real(psb_dpk_), intent(in) :: x,y,z - b1=dzero - end function b1 - function b2(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: b2 - real(psb_dpk_), intent(in) :: x,y,z - b2=dzero - end function b2 - function b3(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: b3 - real(psb_dpk_), intent(in) :: x,y,z - b3=dzero - end function b3 - function c(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: c - real(psb_dpk_), intent(in) :: x,y,z - c=dzero - end function c - function a1(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: a1 - real(psb_dpk_), intent(in) :: x,y,z - a1=done/80 - end function a1 - function a2(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: a2 - real(psb_dpk_), intent(in) :: x,y,z - a2=done/80 - end function a2 - function a3(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: a3 - real(psb_dpk_), intent(in) :: x,y,z - a3=done/80 - end function a3 - function g(x,y,z) - use psb_base_mod, only : psb_dpk_, done, dzero - implicit none - real(psb_dpk_) :: g - real(psb_dpk_), intent(in) :: x,y,z - g = dzero - if (x == done) then - g = done - else if (x == dzero) then - g = exp(y**2-z**2) - end if - end function g - - ! - ! subroutine to allocate and fill in the coefficient matrix and - ! the rhs. - ! - subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info,& - & f,amold,vmold,imold,partition,nrl,iv,tnd) - use psb_base_mod - use psb_util_mod - ! - ! Discretizes the partial differential equation - ! - ! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) - ! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f - ! dxdx dydy dzdz dx dy dz - ! - ! with Dirichlet boundary conditions - ! u = g - ! - ! on the unit cube 0<=x,y,z<=1. - ! - ! - ! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. - ! - implicit none - integer(psb_ipk_) :: idim - type(psb_dspmat_type) :: a - type(psb_d_vect_type) :: xv,bv - type(psb_desc_type) :: desc_a - type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: info - character(len=*) :: afmt - procedure(d_func_3d), optional :: f - class(psb_d_base_sparse_mat), optional :: amold - class(psb_d_base_vect_type), optional :: vmold - class(psb_i_base_vect_type), optional :: imold - integer(psb_ipk_), optional :: partition, nrl,iv(:) - logical, optional :: tnd - ! Local variables. - - integer(psb_ipk_), parameter :: nb=20 - type(psb_d_csc_sparse_mat) :: acsc - type(psb_d_coo_sparse_mat) :: acoo - type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z - integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ - integer(psb_lpk_) :: m,n,glob_row,nt - integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner - ! For 3D partition - ! Note: integer control variables going directly into an MPI call - ! must be 4 bytes, i.e. psb_mpk_ - integer(psb_mpk_) :: npdims(3), npp, minfo - integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz - integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:) - ! Process grid - integer(psb_ipk_) :: np, iam, nth - integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) - ! deltah dimension of each grid cell - ! deltat discretization time - real(psb_dpk_) :: deltah, sqdeltah, deltah2 - real(psb_dpk_), parameter :: rhs=dzero,one=done,zero=dzero - real(psb_dpk_) :: t0, t1, t2, t3, tasb, talc, ttot, tgen, tcdasb - integer(psb_ipk_) :: err_act - procedure(d_func_3d), pointer :: f_ - logical :: tnd_ - character(len=20) :: name, ch_err,tmpfmt - - info = psb_success_ - name = 'create_matrix' - call psb_erractionsave(err_act) - - call psb_info(ctxt, iam, np) - - - if (present(f)) then - f_ => f - else - f_ => d_null_func_3d - end if - - deltah = done/(idim+1) - sqdeltah = deltah*deltah - deltah2 = (2*done)* deltah - - if (present(partition)) then - if ((1<= partition).and.(partition <= 3)) then - partition_ = partition - else - write(*,*) 'Invalid partition choice ',partition,' defaulting to 3' - partition_ = 3 - end if - else - partition_ = 3 - end if - - ! initialize array descriptor and sparse matrix storage. provide an - ! estimate of the number of non zeroes - - m = (1_psb_lpk_*idim)*idim*idim - n = m - nnz = ((n*7)/(np)) - if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n - t0 = psb_wtime() - select case(partition_) - case(1) - ! A BLOCK partition - if (present(nrl)) then - nr = nrl - else - ! - ! Using a simple BLOCK distribution. - ! - nt = (m+np-1)/np - nr = max(0,min(nt,m-(iam*nt))) - end if - - nt = nr - call psb_sum(ctxt,nt) - if (nt /= m) then - write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m - info = -1 - call psb_barrier(ctxt) - call psb_abort(ctxt) - return - end if - - ! - ! First example of use of CDALL: specify for each process a number of - ! contiguous rows - ! - call psb_cdall(ctxt,desc_a,info,nl=nr) - myidx = desc_a%get_global_indices() - nlr = size(myidx) - - case(2) - ! A partition defined by the user through IV - - if (present(iv)) then - if (size(iv) /= m) then - write(psb_err_unit,*) iam, 'Initialization error: wrong IV size',size(iv),m - info = -1 - call psb_barrier(ctxt) - call psb_abort(ctxt) - return - end if - else - write(psb_err_unit,*) iam, 'Initialization error: IV not present' - info = -1 - call psb_barrier(ctxt) - call psb_abort(ctxt) - return - end if - - ! - ! Second example of use of CDALL: specify for each row the - ! process that owns it - ! - call psb_cdall(ctxt,desc_a,info,vg=iv) - myidx = desc_a%get_global_indices() - nlr = size(myidx) - - case(3) - ! A 3-dimensional partition - - ! A nifty MPI function will split the process list - npdims = 0 - call mpi_dims_create(np,3,npdims,info) - npx = npdims(1) - npy = npdims(2) - npz = npdims(3) - - allocate(bndx(0:npx),bndy(0:npy),bndz(0:npz)) - ! We can reuse idx2ijk for process indices as well. - call idx2ijk(iamx,iamy,iamz,iam,npx,npy,npz,base=0) - ! Now let's split the 3D cube in hexahedra - call dist1Didx(bndx,idim,npx) - mynx = bndx(iamx+1)-bndx(iamx) - call dist1Didx(bndy,idim,npy) - myny = bndy(iamy+1)-bndy(iamy) - call dist1Didx(bndz,idim,npz) - mynz = bndz(iamz+1)-bndz(iamz) - - ! How many indices do I own? - nlr = mynx*myny*mynz - allocate(myidx(nlr)) - ! Now, let's generate the list of indices I own - nr = 0 - do i=bndx(iamx),bndx(iamx+1)-1 - do j=bndy(iamy),bndy(iamy+1)-1 - do k=bndz(iamz),bndz(iamz+1)-1 - nr = nr + 1 - call ijk2idx(myidx(nr),i,j,k,idim,idim,idim) - end do - end do - end do - if (nr /= nlr) then - write(psb_err_unit,*) iam,iamx,iamy,iamz, 'Initialization error: NR vs NLR ',& - & nr,nlr,mynx,myny,mynz - info = -1 - call psb_barrier(ctxt) - call psb_abort(ctxt) - end if - - ! - ! Third example of use of CDALL: specify for each process - ! the set of global indices it owns. - ! - call psb_cdall(ctxt,desc_a,info,vl=myidx) - - case default - write(psb_err_unit,*) iam, 'Initialization error: should not get here' - info = -1 - call psb_barrier(ctxt) - call psb_abort(ctxt) - return - end select - - - if (info == psb_success_) call psb_spall(a,desc_a,info,nnz=nnz) - ! define rhs from boundary conditions; also build initial guess - if (info == psb_success_) call psb_geall(xv,desc_a,info) - if (info == psb_success_) call psb_geall(bv,desc_a,info) - - call psb_barrier(ctxt) - talc = psb_wtime()-t0 - - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - - call psb_barrier(ctxt) - t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - - tgen = psb_wtime()-t1 - if(info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='insert rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - deallocate(val,irow,icol) - - call psb_barrier(ctxt) - t1 = psb_wtime() - call psb_cdasb(desc_a,info,mold=imold) - tcdasb = psb_wtime()-t1 - call psb_barrier(ctxt) - t1 = psb_wtime() - if (info == psb_success_) then - if (present(amold)) then - call psb_spasb(a,desc_a,info,mold=amold,bld_and=tnd) - else - call psb_spasb(a,desc_a,info,afmt=afmt,bld_and=tnd) - end if - end if - call psb_barrier(ctxt) - if(info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='asb rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - if (info == psb_success_) call psb_geasb(xv,desc_a,info,mold=vmold) - if (info == psb_success_) call psb_geasb(bv,desc_a,info,mold=vmold) - if(info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='asb rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - tasb = psb_wtime()-t1 - call psb_barrier(ctxt) - ttot = psb_wtime() - t0 - - call psb_amx(ctxt,talc) - call psb_amx(ctxt,tgen) - call psb_amx(ctxt,tasb) - call psb_amx(ctxt,ttot) - if(iam == psb_root_) then - tmpfmt = a%get_fmt() - write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")')& - & tmpfmt - write(psb_out_unit,'("-allocation time : ",es12.5)') talc - write(psb_out_unit,'("-coeff. gen. time : ",es12.5)') tgen - write(psb_out_unit,'("-desc asbly time : ",es12.5)') tcdasb - write(psb_out_unit,'("- mat asbly time : ",es12.5)') tasb - write(psb_out_unit,'("-total time : ",es12.5)') ttot - - end if - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(ctxt,err_act) - - return - end subroutine psb_d_gen_pde3d - - subroutine psb_spmv_overlap_kernel(ctxt) - use psb_base_mod - use psb_util_mod - - implicit none - - type(psb_ctxt_type), intent(in) :: ctxt - real(psb_dpk_) :: alpha, beta - - type(psb_dspmat_type) :: a - type(psb_d_vect_type) :: x_isend, x_neighbor, x_persistent - type(psb_d_vect_type) :: y_ov_isend, y_ov_neighbor, y_ov_persistent - type(psb_d_vect_type) :: y_no_isend, y_no_neighbor, y_no_persistent - type(psb_desc_type) :: desc_a - - character(len=64) :: env_buf - real(psb_dpk_), allocatable :: x_global(:), y_global(:) - integer(psb_ipk_) :: my_rank, np, info, err_act - integer :: env_len, env_status, ios - integer(psb_ipk_) :: n_global, idim - integer(psb_ipk_) :: i, times - real(psb_dpk_) :: t0, t1, dt - real(psb_dpk_) :: t_ov_isend, t_ov_neighbor, t_ov_persistent - real(psb_dpk_) :: t_no_isend, t_no_neighbor, t_no_persistent - real(psb_dpk_) :: err_isend, err_neighbor, err_persistent, tol - real(psb_dpk_) :: avg_ov, avg_no, speedup, gain_pct - integer(psb_ipk_) :: n_init_neighbor_l, n_start_neighbor_l, n_wait_neighbor_l, n_realloc_neighbor_l - integer(psb_ipk_) :: n_init_persist_l, n_start_persist_l, n_wait_persist_l, n_realloc_persist_l - integer(psb_ipk_) :: n_init_neighbor_g, n_start_neighbor_g, n_wait_neighbor_g, n_realloc_neighbor_g - integer(psb_ipk_) :: n_init_persist_g, n_start_persist_g, n_wait_persist_g, n_realloc_persist_g - integer(psb_ipk_) :: n_ineighbor_neighbor_l, n_ineighbor_neighbor_g - integer(psb_ipk_) :: peak_start_neighbor_l, peak_start_persist_l - integer(psb_ipk_) :: peak_start_neighbor_g, peak_start_persist_g - integer(psb_ipk_) :: last_start_neighbor_l, last_start_persist_l - integer(psb_ipk_) :: accum_start_neighbor_l, accum_start_persist_l - integer(psb_ipk_) :: accum_start_neighbor_g, accum_start_persist_g - integer(psb_ipk_) :: comm_type_neighbor_l, comm_type_persist_l - integer(psb_ipk_) :: comm_type_neighbor_g, comm_type_persist_g - integer(psb_ipk_) :: is_neighbor_handle_l, is_persist_handle_l - integer(psb_ipk_) :: is_neighbor_handle_g, is_persist_handle_g - integer(psb_ipk_) :: use_persistent_neighbor_l, use_persistent_persist_l - integer(psb_ipk_) :: use_persistent_neighbor_g, use_persistent_persist_g - - info = psb_success_ - tol = 1.0d-10 - times = 100 - t_ov_isend = 0.0_psb_dpk_ - t_ov_neighbor = 0.0_psb_dpk_ - t_ov_persistent = 0.0_psb_dpk_ - t_no_isend = 0.0_psb_dpk_ - t_no_neighbor = 0.0_psb_dpk_ - t_no_persistent = 0.0_psb_dpk_ - peak_start_neighbor_l = 0 - peak_start_persist_l = 0 - n_ineighbor_neighbor_l = 0 - last_start_neighbor_l = -1 - last_start_persist_l = -1 - accum_start_neighbor_l = 0 - accum_start_persist_l = 0 - comm_type_neighbor_l = -1 - comm_type_persist_l = -1 - is_neighbor_handle_l = 0 - is_persist_handle_l = 0 - use_persistent_neighbor_l = 0 - use_persistent_persist_l = 0 - idim = 10 - call psb_erractionsave(err_act) - - call get_environment_variable('IDIM', env_buf, length=env_len, status=env_status) - if ((env_status == 0) .and. (env_len > 0)) then - read(env_buf(1:env_len), *, iostat=ios) idim - if ((ios /= 0) .or. (idim < 2)) idim = 10 - end if - - call get_environment_variable('TIMES', env_buf, length=env_len, status=env_status) - if ((env_status == 0) .and. (env_len > 0)) then - read(env_buf(1:env_len), *, iostat=ios) times - if ((ios /= 0) .or. (times < 1)) times = 100 - end if - - n_global = idim * idim * idim - alpha = done - beta = dzero - - call psb_info(ctxt, my_rank, np) - - - call psb_barrier(ctxt) - call psb_d_gen_pde3d(ctxt,idim,a,y_ov_isend,x_isend,desc_a,"CSR",info,partition=1) - - if (info /= psb_success_) goto 9999 - call psb_barrier(ctxt) - - if (my_rank == psb_root_) then - allocate(x_global(n_global)) - allocate(y_global(n_global)) - do i = 1, n_global - x_global(i) = real(mod(i,17)+1, psb_dpk_) / real(17, psb_dpk_) - y_global(i) = real(mod(i,13), psb_dpk_) / real(29, psb_dpk_) - end do - end if - - call psb_geall(x_neighbor, desc_a, info) - call psb_geall(x_persistent, desc_a, info) - call psb_geall(y_ov_neighbor, desc_a, info) - call psb_geall(y_ov_persistent, desc_a, info) - call psb_geall(y_no_isend, desc_a, info) - call psb_geall(y_no_neighbor, desc_a, info) - call psb_geall(y_no_persistent, desc_a, info) - if (info /= psb_success_) goto 9999 - - call psb_scatter(x_global, x_isend, desc_a, info, root=psb_root_) - call psb_scatter(x_global, x_neighbor, desc_a, info, root=psb_root_) - call psb_scatter(x_global, x_persistent, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_ov_isend, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_ov_neighbor, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_ov_persistent, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_no_isend, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_no_neighbor, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_no_persistent, desc_a, info, root=psb_root_) - if (info /= psb_success_) goto 9999 - - ! Set communication schemes on the x vectors used by psb_spmm. - call psb_comm_set(psb_comm_isend_irecv_, x_isend%v%comm_handle, info) - if (info /= psb_success_) goto 9999 - call psb_comm_set(psb_comm_ineighbor_alltoallv_, x_neighbor%v%comm_handle, info) - if (info /= psb_success_) goto 9999 - call psb_comm_set(psb_comm_persistent_ineighbor_alltoallv_, x_persistent%v%comm_handle, info) - if (info /= psb_success_) goto 9999 - - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - is_neighbor_handle_l = 1 - comm_type_neighbor_l = ch%comm_type - if (ch%use_persistent_buffers) use_persistent_neighbor_l = 1 - class default - continue - end select - - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - is_persist_handle_l = 1 - comm_type_persist_l = ch%comm_type - if (ch%use_persistent_buffers) use_persistent_persist_l = 1 - class default - continue - end select - - ! Warm-up all schemes once: overlap and non-overlap paths. - call psb_spmm(alpha, a, x_isend, beta, y_ov_isend, desc_a, info, doswap=.true.) - call psb_halo(x_isend, desc_a, info) - call psb_spmm(alpha, a, x_isend, beta, y_no_isend, desc_a, info, doswap=.false.) - - call psb_spmm(alpha, a, x_neighbor, beta, y_ov_neighbor, desc_a, info, doswap=.true.) - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - n_ineighbor_neighbor_l = ch%diag_ineighbor_calls - if (last_start_neighbor_l < 0) then - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_neighbor_l) then - accum_start_neighbor_l = accum_start_neighbor_l + (ch%diag_start_calls - last_start_neighbor_l) - else - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - end if - last_start_neighbor_l = ch%diag_start_calls - peak_start_neighbor_l = max(peak_start_neighbor_l, ch%diag_start_calls) - class default - continue - end select - call psb_halo(x_neighbor, desc_a, info) - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - n_ineighbor_neighbor_l = ch%diag_ineighbor_calls - if (last_start_neighbor_l < 0) then - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_neighbor_l) then - accum_start_neighbor_l = accum_start_neighbor_l + (ch%diag_start_calls - last_start_neighbor_l) - else - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - end if - last_start_neighbor_l = ch%diag_start_calls - peak_start_neighbor_l = max(peak_start_neighbor_l, ch%diag_start_calls) - class default - continue - end select - call psb_spmm(alpha, a, x_neighbor, beta, y_no_neighbor, desc_a, info, doswap=.false.) - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - n_ineighbor_neighbor_l = ch%diag_ineighbor_calls - if (last_start_neighbor_l < 0) then - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_neighbor_l) then - accum_start_neighbor_l = accum_start_neighbor_l + (ch%diag_start_calls - last_start_neighbor_l) - else - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - end if - last_start_neighbor_l = ch%diag_start_calls - peak_start_neighbor_l = max(peak_start_neighbor_l, ch%diag_start_calls) - class default - continue - end select - - call psb_spmm(alpha, a, x_persistent, beta, y_ov_persistent, desc_a, info, doswap=.true.) - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_persist_l < 0) then - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_persist_l) then - accum_start_persist_l = accum_start_persist_l + (ch%diag_start_calls - last_start_persist_l) - else - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - end if - last_start_persist_l = ch%diag_start_calls - peak_start_persist_l = max(peak_start_persist_l, ch%diag_start_calls) - class default - continue - end select - call psb_halo(x_persistent, desc_a, info) - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_persist_l < 0) then - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_persist_l) then - accum_start_persist_l = accum_start_persist_l + (ch%diag_start_calls - last_start_persist_l) - else - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - end if - last_start_persist_l = ch%diag_start_calls - peak_start_persist_l = max(peak_start_persist_l, ch%diag_start_calls) - class default - continue - end select - call psb_spmm(alpha, a, x_persistent, beta, y_no_persistent, desc_a, info, doswap=.false.) - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_persist_l < 0) then - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_persist_l) then - accum_start_persist_l = accum_start_persist_l + (ch%diag_start_calls - last_start_persist_l) - else - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - end if - last_start_persist_l = ch%diag_start_calls - peak_start_persist_l = max(peak_start_persist_l, ch%diag_start_calls) - class default - continue - end select - if (info /= psb_success_) goto 9999 - - ! ----------------------------- - ! isend/irecv scheme - ! ----------------------------- - call psb_scatter(x_global, x_isend, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_ov_isend, desc_a, info, root=psb_root_) - call psb_barrier(ctxt) - t0 = psb_wtime() - do i = 1, times - call psb_spmm(alpha, a, x_isend, beta, y_ov_isend, desc_a, info, doswap=.true.) - end do - t1 = psb_wtime() - dt = t1 - t0 - call psb_amx(ctxt, dt) - t_ov_isend = dt - - call psb_scatter(x_global, x_isend, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_no_isend, desc_a, info, root=psb_root_) - call psb_barrier(ctxt) - t0 = psb_wtime() - do i = 1, times - call psb_halo(x_isend, desc_a, info) - call psb_spmm(alpha, a, x_isend, beta, y_no_isend, desc_a, info, doswap=.false.) - end do - t1 = psb_wtime() - dt = t1 - t0 - call psb_amx(ctxt, dt) - t_no_isend = dt - - ! ----------------------------- - ! ineighbor_alltoallv scheme - ! ----------------------------- - call psb_scatter(x_global, x_neighbor, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_ov_neighbor, desc_a, info, root=psb_root_) - call psb_barrier(ctxt) - t0 = psb_wtime() - do i = 1, times - call psb_spmm(alpha, a, x_neighbor, beta, y_ov_neighbor, desc_a, info, doswap=.true.) - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_neighbor_l < 0) then - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_neighbor_l) then - accum_start_neighbor_l = accum_start_neighbor_l + (ch%diag_start_calls - last_start_neighbor_l) - else - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - end if - last_start_neighbor_l = ch%diag_start_calls - peak_start_neighbor_l = max(peak_start_neighbor_l, ch%diag_start_calls) - class default - continue - end select - end do - t1 = psb_wtime() - dt = t1 - t0 - call psb_amx(ctxt, dt) - t_ov_neighbor = dt - - call psb_scatter(x_global, x_neighbor, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_no_neighbor, desc_a, info, root=psb_root_) - call psb_barrier(ctxt) - t0 = psb_wtime() - do i = 1, times - call psb_halo(x_neighbor, desc_a, info) - call psb_spmm(alpha, a, x_neighbor, beta, y_no_neighbor, desc_a, info, doswap=.false.) - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_neighbor_l < 0) then - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_neighbor_l) then - accum_start_neighbor_l = accum_start_neighbor_l + (ch%diag_start_calls - last_start_neighbor_l) - else - accum_start_neighbor_l = accum_start_neighbor_l + ch%diag_start_calls - end if - last_start_neighbor_l = ch%diag_start_calls - peak_start_neighbor_l = max(peak_start_neighbor_l, ch%diag_start_calls) - class default - continue - end select - end do - t1 = psb_wtime() - dt = t1 - t0 - call psb_amx(ctxt, dt) - t_no_neighbor = dt - - ! ---------------------------------------- - ! persistent_ineighbor_alltoallv scheme - ! ---------------------------------------- - call psb_scatter(x_global, x_persistent, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_ov_persistent, desc_a, info, root=psb_root_) - call psb_barrier(ctxt) - t0 = psb_wtime() - do i = 1, times - call psb_spmm(alpha, a, x_persistent, beta, y_ov_persistent, desc_a, info, doswap=.true.) - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_persist_l < 0) then - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_persist_l) then - accum_start_persist_l = accum_start_persist_l + (ch%diag_start_calls - last_start_persist_l) - else - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - end if - last_start_persist_l = ch%diag_start_calls - peak_start_persist_l = max(peak_start_persist_l, ch%diag_start_calls) - class default - continue - end select - end do - t1 = psb_wtime() - dt = t1 - t0 - call psb_amx(ctxt, dt) - t_ov_persistent = dt - - call psb_scatter(x_global, x_persistent, desc_a, info, root=psb_root_) - call psb_scatter(y_global, y_no_persistent, desc_a, info, root=psb_root_) - call psb_barrier(ctxt) - t0 = psb_wtime() - do i = 1, times - call psb_halo(x_persistent, desc_a, info) - call psb_spmm(alpha, a, x_persistent, beta, y_no_persistent, desc_a, info, doswap=.false.) - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - if (last_start_persist_l < 0) then - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - else if (ch%diag_start_calls >= last_start_persist_l) then - accum_start_persist_l = accum_start_persist_l + (ch%diag_start_calls - last_start_persist_l) - else - accum_start_persist_l = accum_start_persist_l + ch%diag_start_calls - end if - last_start_persist_l = ch%diag_start_calls - peak_start_persist_l = max(peak_start_persist_l, ch%diag_start_calls) - class default - continue - end select - end do - t1 = psb_wtime() - dt = t1 - t0 - call psb_amx(ctxt, dt) - t_no_persistent = dt - - if (info /= psb_success_) goto 9999 - - err_isend = maxval(abs(y_ov_isend%get_vect() - y_no_isend%get_vect())) - err_neighbor = maxval(abs(y_ov_neighbor%get_vect() - y_no_neighbor%get_vect())) - err_persistent = maxval(abs(y_ov_persistent%get_vect() - y_no_persistent%get_vect())) - call psb_amx(ctxt, err_isend) - call psb_amx(ctxt, err_neighbor) - call psb_amx(ctxt, err_persistent) - - n_init_neighbor_l = 0 - n_start_neighbor_l = 0 - n_wait_neighbor_l = 0 - n_realloc_neighbor_l = 0 - n_init_persist_l = 0 - n_start_persist_l = 0 - n_wait_persist_l = 0 - n_realloc_persist_l = 0 - - select type(ch => x_neighbor%v%comm_handle) - type is(psb_comm_neighbor_handle) - n_init_neighbor_l = ch%diag_init_calls - n_start_neighbor_l = ch%diag_start_calls - n_wait_neighbor_l = ch%diag_wait_calls - n_realloc_neighbor_l = ch%diag_buffer_reallocs - class default - continue - end select - - select type(ch => x_persistent%v%comm_handle) - type is(psb_comm_neighbor_handle) - n_init_persist_l = ch%diag_init_calls - n_start_persist_l = ch%diag_start_calls - n_wait_persist_l = ch%diag_wait_calls - n_realloc_persist_l = ch%diag_buffer_reallocs - class default - continue - end select - - n_init_neighbor_g = n_init_neighbor_l - n_start_neighbor_g = n_start_neighbor_l - n_wait_neighbor_g = n_wait_neighbor_l - n_realloc_neighbor_g = n_realloc_neighbor_l - n_init_persist_g = n_init_persist_l - n_start_persist_g = n_start_persist_l - n_wait_persist_g = n_wait_persist_l - n_realloc_persist_g = n_realloc_persist_l - n_ineighbor_neighbor_g = n_ineighbor_neighbor_l - peak_start_neighbor_g = peak_start_neighbor_l - peak_start_persist_g = peak_start_persist_l - accum_start_neighbor_g = accum_start_neighbor_l - accum_start_persist_g = accum_start_persist_l - comm_type_neighbor_g = comm_type_neighbor_l - comm_type_persist_g = comm_type_persist_l - is_neighbor_handle_g = is_neighbor_handle_l - is_persist_handle_g = is_persist_handle_l - use_persistent_neighbor_g = use_persistent_neighbor_l - use_persistent_persist_g = use_persistent_persist_l - - call psb_sum(ctxt, n_init_neighbor_g) - call psb_sum(ctxt, n_start_neighbor_g) - call psb_sum(ctxt, n_wait_neighbor_g) - call psb_sum(ctxt, n_realloc_neighbor_g) - call psb_sum(ctxt, n_init_persist_g) - call psb_sum(ctxt, n_start_persist_g) - call psb_sum(ctxt, n_wait_persist_g) - call psb_sum(ctxt, n_realloc_persist_g) - call psb_sum(ctxt, n_ineighbor_neighbor_g) - call psb_sum(ctxt, peak_start_neighbor_g) - call psb_sum(ctxt, peak_start_persist_g) - call psb_sum(ctxt, accum_start_neighbor_g) - call psb_sum(ctxt, accum_start_persist_g) - call psb_sum(ctxt, comm_type_neighbor_g) - call psb_sum(ctxt, comm_type_persist_g) - call psb_sum(ctxt, is_neighbor_handle_g) - call psb_sum(ctxt, is_persist_handle_g) - call psb_sum(ctxt, use_persistent_neighbor_g) - call psb_sum(ctxt, use_persistent_persist_g) - - if (my_rank == 0) then - write(psb_out_unit,'(/,"SpMV overlap benchmark")') - write(psb_out_unit,'(" idim : ",i0)') idim - write(psb_out_unit,'(" global unknowns : ",i0)') n_global - write(psb_out_unit,'(" repetitions : ",i0)') times - write(psb_out_unit,'(" timing metric : max over MPI ranks")') - write(psb_out_unit,'(" gain(%) = 100*(1 - overlap/no_overlap)")') - - write(psb_out_unit,'(/,"Scheme: isend_irecv")') - avg_ov = t_ov_isend / real(times, psb_dpk_) - avg_no = t_no_isend / real(times, psb_dpk_) - speedup = t_no_isend / max(t_ov_isend, tiny(done)) - gain_pct = 100.0_psb_dpk_ * (done - (t_ov_isend / max(t_no_isend, tiny(done)))) - write(psb_out_unit,'(" total overlap : ",es12.5)') t_ov_isend - write(psb_out_unit,'(" total no_overlap : ",es12.5)') t_no_isend - write(psb_out_unit,'(" avg overlap : ",es12.5)') avg_ov - write(psb_out_unit,'(" avg no_overlap : ",es12.5)') avg_no - write(psb_out_unit,'(" speedup (no/ov) : ",f10.4)') speedup - write(psb_out_unit,'(" gain (%) : ",f10.4)') gain_pct - write(psb_out_unit,'(" overlap vs no_overlap err = ",es12.5)') err_isend - - write(psb_out_unit,'(/,"Scheme: ineighbor_alltoallv")') - avg_ov = t_ov_neighbor / real(times, psb_dpk_) - avg_no = t_no_neighbor / real(times, psb_dpk_) - speedup = t_no_neighbor / max(t_ov_neighbor, tiny(done)) - gain_pct = 100.0_psb_dpk_ * (done - (t_ov_neighbor / max(t_no_neighbor, tiny(done)))) - write(psb_out_unit,'(" total overlap : ",es12.5)') t_ov_neighbor - write(psb_out_unit,'(" total no_overlap : ",es12.5)') t_no_neighbor - write(psb_out_unit,'(" avg overlap : ",es12.5)') avg_ov - write(psb_out_unit,'(" avg no_overlap : ",es12.5)') avg_no - write(psb_out_unit,'(" speedup (no/ov) : ",f10.4)') speedup - write(psb_out_unit,'(" gain (%) : ",f10.4)') gain_pct - write(psb_out_unit,'(" overlap vs no_overlap err = ",es12.5)') err_neighbor - - write(psb_out_unit,'(/,"Scheme: persistent_ineighbor_alltoallv")') - avg_ov = t_ov_persistent / real(times, psb_dpk_) - avg_no = t_no_persistent / real(times, psb_dpk_) - speedup = t_no_persistent / max(t_ov_persistent, tiny(done)) - gain_pct = 100.0_psb_dpk_ * (done - (t_ov_persistent / max(t_no_persistent, tiny(done)))) - write(psb_out_unit,'(" total overlap : ",es12.5)') t_ov_persistent - write(psb_out_unit,'(" total no_overlap : ",es12.5)') t_no_persistent - write(psb_out_unit,'(" avg overlap : ",es12.5)') avg_ov - write(psb_out_unit,'(" avg no_overlap : ",es12.5)') avg_no - write(psb_out_unit,'(" speedup (no/ov) : ",f10.4)') speedup - write(psb_out_unit,'(" gain (%) : ",f10.4)') gain_pct - write(psb_out_unit,'(" overlap vs no_overlap err = ",es12.5)') err_persistent - - write(psb_out_unit,'(/,"Communication diagnostics (global sum over MPI ranks):")') - write(psb_out_unit,'(" ineighbor_alltoallv: init=",i0,", start=",i0,", wait=",i0,", realloc=",i0)') & - & n_init_neighbor_g, n_start_neighbor_g, n_wait_neighbor_g, n_realloc_neighbor_g - write(psb_out_unit,'(" ineighbor_alltoallv calls : ",i0)') n_ineighbor_neighbor_g - write(psb_out_unit,'(" ineighbor peak start observed during run: ",i0)') peak_start_neighbor_g - write(psb_out_unit,'(" ineighbor accumulated starts (robust): ",i0)') accum_start_neighbor_g - write(psb_out_unit,'(" persistent_ineighbor_a2av: init=",i0,", start=",i0,", wait=",i0,", realloc=",i0)') & - & n_init_persist_g, n_start_persist_g, n_wait_persist_g, n_realloc_persist_g - write(psb_out_unit,'(" persistent peak start observed during run: ",i0)') peak_start_persist_g - write(psb_out_unit,'(" persistent accumulated starts (robust): ",i0)') accum_start_persist_g - write(psb_out_unit,'(" handle check (sum over ranks): neighbor is_type=",i0,", comm_type=",i0,", use_persistent=",i0)') & - & is_neighbor_handle_g, comm_type_neighbor_g, use_persistent_neighbor_g - write(psb_out_unit,'(" handle check (sum over ranks): persistent is_type=",i0,", comm_type=",i0,", use_persistent=",i0)') & - & is_persist_handle_g, comm_type_persist_g, use_persistent_persist_g - - if ((err_isend > tol) .or. (err_neighbor > tol) .or. (err_persistent > tol)) then - write(psb_out_unit,'(" WARNING: mismatch exceeds tolerance ",es12.5)') tol - end if - end if - - call psb_gefree(x_isend, desc_a, info) - call psb_gefree(x_neighbor, desc_a, info) - call psb_gefree(x_persistent, desc_a, info) - call psb_gefree(y_ov_isend, desc_a, info) - call psb_gefree(y_ov_neighbor, desc_a, info) - call psb_gefree(y_ov_persistent, desc_a, info) - call psb_gefree(y_no_isend, desc_a, info) - call psb_gefree(y_no_neighbor, desc_a, info) - call psb_gefree(y_no_persistent, desc_a, info) - call psb_spfree(a, desc_a, info) - call psb_cdfree(desc_a, info) - - if (my_rank == 0) then - deallocate(x_global) - deallocate(y_global) - end if - - return - -9999 call psb_error(ctxt) - call psb_error_handler(ctxt, err_act) - - end subroutine psb_spmv_overlap_kernel - - -end module psb_spmv_overlap_test diff --git a/test/comm/spmv/psb_spmv_test.f90 b/test/comm/spmv/psb_spmv_test.f90 new file mode 100644 index 00000000..897f9874 --- /dev/null +++ b/test/comm/spmv/psb_spmv_test.f90 @@ -0,0 +1,694 @@ +!> Test program for overlapping communication and computation with psb_spmm. +!! +!! This benchmark compares two equivalent SpMV paths: +!! 1. Serialized halo exchange + compute +!! 2. Overlapped psb_spmm(..., doswap=.true.) +!! +module psb_spmv_overlap_test + + use psb_base_mod + use psb_util_mod + use psb_comm_factory_mod, only: psb_comm_set + use psb_comm_schemes_mod, only: psb_comm_isend_irecv_, psb_comm_ineighbor_alltoallv_, & + & psb_comm_persistent_ineighbor_alltoallv_ + use psb_comm_neighbor_impl_mod, only: psb_comm_neighbor_handle +#ifdef PSB_HAVE_CUDA + use psb_cuda_mod +#endif + + implicit none + + interface + function d_func_3d(x,y,z) result(val) + import :: psb_dpk_ + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + end function d_func_3d + end interface + +contains + + + function d_null_func_3d(x,y,z) result(val) + + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + + val = dzero + + end function d_null_func_3d + ! + ! functions parametrizing the differential equation + ! + + ! + ! Note: b1, b2 and b3 are the coefficients of the first + ! derivative of the unknown function. The default + ! we apply here is to have them zero, so that the resulting + ! matrix is symmetric/hermitian and suitable for + ! testing with CG and FCG. + ! When testing methods for non-hermitian matrices you can + ! change the B1/B2/B3 functions to e.g. done/sqrt((3*done)) + ! + function b1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b1 + real(psb_dpk_), intent(in) :: x,y,z + b1=dzero + end function b1 + function b2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b2 + real(psb_dpk_), intent(in) :: x,y,z + b2=dzero + end function b2 + function b3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b3 + real(psb_dpk_), intent(in) :: x,y,z + b3=dzero + end function b3 + function c(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: c + real(psb_dpk_), intent(in) :: x,y,z + c=dzero + end function c + function a1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a1 + real(psb_dpk_), intent(in) :: x,y,z + a1=done/80 + end function a1 + function a2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a2 + real(psb_dpk_), intent(in) :: x,y,z + a2=done/80 + end function a2 + function a3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a3 + real(psb_dpk_), intent(in) :: x,y,z + a3=done/80 + end function a3 + function g(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: g + real(psb_dpk_), intent(in) :: x,y,z + g = dzero + if (x == done) then + g = done + else if (x == dzero) then + g = exp(y**2-z**2) + end if + end function g + + ! + ! subroutine to allocate and fill in the coefficient matrix and + ! the rhs. + ! + subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info,& + & f,amold,vmold,imold,partition,nrl,iv,tnd) + use psb_base_mod + use psb_util_mod + ! + ! Discretizes the partial differential equation + ! + ! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) + ! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f + ! dxdx dydy dzdz dx dy dz + ! + ! with Dirichlet boundary conditions + ! u = g + ! + ! on the unit cube 0<=x,y,z<=1. + ! + ! + ! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. + ! + implicit none + integer(psb_ipk_) :: idim + type(psb_dspmat_type) :: a + type(psb_d_vect_type) :: xv,bv + type(psb_desc_type) :: desc_a + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: info + character(len=*) :: afmt + procedure(d_func_3d), optional :: f + class(psb_d_base_sparse_mat), optional :: amold + class(psb_d_base_vect_type), optional :: vmold + class(psb_i_base_vect_type), optional :: imold + integer(psb_ipk_), optional :: partition, nrl,iv(:) + logical, optional :: tnd + ! Local variables. + + integer(psb_ipk_), parameter :: nb=20 + type(psb_d_csc_sparse_mat) :: acsc + type(psb_d_coo_sparse_mat) :: acoo + type(psb_d_csr_sparse_mat) :: acsr + real(psb_dpk_) :: zt(nb),x,y,z + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ + integer(psb_lpk_) :: m,n,glob_row,nt + integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner + ! For 3D partition + ! Note: integer control variables going directly into an MPI call + ! must be 4 bytes, i.e. psb_mpk_ + integer(psb_mpk_) :: npdims(3), npp, minfo + integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz + integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:) + ! Process grid + integer(psb_ipk_) :: np, iam, nth + integer(psb_ipk_) :: icoeff + integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) + real(psb_dpk_), allocatable :: val(:) + ! deltah dimension of each grid cell + ! deltat discretization time + real(psb_dpk_) :: deltah, sqdeltah, deltah2 + real(psb_dpk_), parameter :: rhs=dzero,one=done,zero=dzero + real(psb_dpk_) :: t0, t1, t2, t3, tasb, talc, ttot, tgen, tcdasb + integer(psb_ipk_) :: err_act + procedure(d_func_3d), pointer :: f_ + logical :: tnd_ + character(len=20) :: name, ch_err,tmpfmt + + info = psb_success_ + name = 'create_matrix' + call psb_erractionsave(err_act) + + call psb_info(ctxt, iam, np) + + + if (present(f)) then + f_ => f + else + f_ => d_null_func_3d + end if + + deltah = done/(idim+1) + sqdeltah = deltah*deltah + deltah2 = (2*done)* deltah + + if (present(partition)) then + if ((1<= partition).and.(partition <= 3)) then + partition_ = partition + else + write(*,*) 'Invalid partition choice ',partition,' defaulting to 3' + partition_ = 3 + end if + else + partition_ = 3 + end if + + tnd_ = .false. + if (present(tnd)) tnd_ = tnd + + ! initialize array descriptor and sparse matrix storage. provide an + ! estimate of the number of non zeroes + + m = (1_psb_lpk_*idim)*idim*idim + n = m + nnz = ((n*7)/(np)) + if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n + t0 = psb_wtime() + select case(partition_) + case(1) + ! A BLOCK partition + if (present(nrl)) then + nr = nrl + else + ! + ! Using a simple BLOCK distribution. + ! + nt = (m+np-1)/np + nr = max(0,min(nt,m-(iam*nt))) + end if + + nt = nr + call psb_sum(ctxt,nt) + if (nt /= m) then + write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! First example of use of CDALL: specify for each process a number of + ! contiguous rows + ! + call psb_cdall(ctxt,desc_a,info,nl=nr) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(2) + ! A partition defined by the user through IV + + if (present(iv)) then + if (size(iv) /= m) then + write(psb_err_unit,*) iam, 'Initialization error: wrong IV size',size(iv),m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + else + write(psb_err_unit,*) iam, 'Initialization error: IV not present' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! Second example of use of CDALL: specify for each row the + ! process that owns it + ! + call psb_cdall(ctxt,desc_a,info,vg=iv) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(3) + ! A 3-dimensional partition + + ! A nifty MPI function will split the process list + npdims = 0 + call mpi_dims_create(np,3,npdims,info) + npx = npdims(1) + npy = npdims(2) + npz = npdims(3) + + allocate(bndx(0:npx),bndy(0:npy),bndz(0:npz)) + ! We can reuse idx2ijk for process indices as well. + call idx2ijk(iamx,iamy,iamz,iam,npx,npy,npz,base=0) + ! Now let's split the 3D cube in hexahedra + call dist1Didx(bndx,idim,npx) + mynx = bndx(iamx+1)-bndx(iamx) + call dist1Didx(bndy,idim,npy) + myny = bndy(iamy+1)-bndy(iamy) + call dist1Didx(bndz,idim,npz) + mynz = bndz(iamz+1)-bndz(iamz) + + ! How many indices do I own? + nlr = mynx*myny*mynz + allocate(myidx(nlr)) + ! Now, let's generate the list of indices I own + nr = 0 + do i=bndx(iamx),bndx(iamx+1)-1 + do j=bndy(iamy),bndy(iamy+1)-1 + do k=bndz(iamz),bndz(iamz+1)-1 + nr = nr + 1 + call ijk2idx(myidx(nr),i,j,k,idim,idim,idim) + end do + end do + end do + if (nr /= nlr) then + write(psb_err_unit,*) iam,iamx,iamy,iamz, 'Initialization error: NR vs NLR ',& + & nr,nlr,mynx,myny,mynz + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + end if + + ! + ! Third example of use of CDALL: specify for each process + ! the set of global indices it owns. + ! + call psb_cdall(ctxt,desc_a,info,vl=myidx) + + case default + write(psb_err_unit,*) iam, 'Initialization error: should not get here' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end select + + + if (info == psb_success_) call psb_spall(a,desc_a,info,nnz=nnz) + ! define rhs from boundary conditions; also build initial guess + if (info == psb_success_) call psb_geall(xv,desc_a,info) + if (info == psb_success_) call psb_geall(bv,desc_a,info) + + call psb_barrier(ctxt) + talc = psb_wtime()-t0 + + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='allocation rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + ! we build an auxiliary matrix consisting of one row at a + ! time; just a small matrix. might be extended to generate + ! a bunch of rows per call. + ! + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + goto 9999 + endif + + + ! loop over rows belonging to current process in a block + ! distribution. + + call psb_barrier(ctxt) + t1 = psb_wtime() + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) exit + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) exit + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) exit + end do + + tgen = psb_wtime()-t1 + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='insert rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + deallocate(val,irow,icol) + + call psb_barrier(ctxt) + t1 = psb_wtime() + if (present(imold)) then + call psb_cdasb(desc_a,info,mold=imold) + else + call psb_cdasb(desc_a,info) + end if + tcdasb = psb_wtime()-t1 + call psb_barrier(ctxt) + t1 = psb_wtime() + if (info == psb_success_) then + if (present(amold)) then + call psb_spasb(a,desc_a,info,mold=amold) + else + call psb_spasb(a,desc_a,info,afmt=afmt) + end if + end if + call psb_barrier(ctxt) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + if (info == psb_success_) then + if (present(vmold)) then + call psb_geasb(xv,desc_a,info,mold=vmold) + if (info == psb_success_) call psb_geasb(bv,desc_a,info,mold=vmold) + else + call psb_geasb(xv,desc_a,info) + if (info == psb_success_) call psb_geasb(bv,desc_a,info) + end if + end if + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + tasb = psb_wtime()-t1 + call psb_barrier(ctxt) + ttot = psb_wtime() - t0 + + call psb_amx(ctxt,talc) + call psb_amx(ctxt,tgen) + call psb_amx(ctxt,tasb) + call psb_amx(ctxt,ttot) + if(iam == psb_root_) then + tmpfmt = a%get_fmt() + write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")')& + & tmpfmt + write(psb_out_unit,'("-allocation time : ",es12.5)') talc + write(psb_out_unit,'("-coeff. gen. time : ",es12.5)') tgen + write(psb_out_unit,'("-desc asbly time : ",es12.5)') tcdasb + write(psb_out_unit,'("- mat asbly time : ",es12.5)') tasb + write(psb_out_unit,'("-total time : ",es12.5)') ttot + + end if + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(ctxt,err_act) + + return + end subroutine psb_d_gen_pde3d + + subroutine run_spmv_kernel(ctxt,use_gpu) + use psb_base_mod +#ifdef PSB_HAVE_CUDA + use psb_cuda_mod +#endif + implicit none + + type(psb_ctxt_type), intent(in) :: ctxt + logical, intent(in) :: use_gpu + + type(psb_dspmat_type) :: a + type(psb_d_vect_type) :: x, y + type(psb_desc_type) :: desc_a + character(len=5) :: afmt + character(len=64) :: env_buf + integer(psb_ipk_) :: my_rank, np, info, err_act + integer(psb_ipk_) :: idim, times, i, n_global + integer :: env_len, env_status, ios + real(psb_dpk_) :: alpha, beta, t0, t1, dt, avg_t + +#ifdef PSB_HAVE_CUDA + type(psb_d_vect_cuda) :: vmold + type(psb_i_vect_cuda) :: imold + type(psb_d_cuda_hlg_sparse_mat), target :: ahlg + class(psb_d_base_sparse_mat), pointer :: agmold +#endif + + info = psb_success_ + afmt = 'CSR' + idim = 10 + times = 100 + alpha = done + beta = dzero + + call psb_erractionsave(err_act) + call psb_info(ctxt, my_rank, np) + + call get_environment_variable('IDIM', env_buf, length=env_len, status=env_status) + if ((env_status == 0) .and. (env_len > 0)) then + read(env_buf(1:env_len), *, iostat=ios) idim + if ((ios /= 0) .or. (idim < 2)) idim = 10 + end if + call get_environment_variable('TIMES', env_buf, length=env_len, status=env_status) + if ((env_status == 0) .and. (env_len > 0)) then + read(env_buf(1:env_len), *, iostat=ios) times + if ((ios /= 0) .or. (times < 1)) times = 100 + end if + + n_global = idim * idim * idim + + call psb_barrier(ctxt) + call psb_d_gen_pde3d(ctxt,idim,a,y,x,desc_a,afmt,info) + if (info /= psb_success_) goto 9999 + +#ifdef PSB_HAVE_CUDA + if (use_gpu) then + agmold => ahlg + call a%cscnv(info,mold=agmold) + if (info /= psb_success_) goto 9999 + call desc_a%cnv(mold=imold) + if (info /= psb_success_) goto 9999 + call psb_geasb(x,desc_a,info,mold=vmold) + if (info /= psb_success_) goto 9999 + call psb_geasb(y,desc_a,info,mold=vmold) + if (info /= psb_success_) goto 9999 + end if +#endif + + ! warm-up + call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.false.) + if (info /= psb_success_) goto 9999 + + call psb_barrier(ctxt) + t0 = psb_wtime() + do i = 1, times + call psb_spmm(alpha, a, x, beta, y, desc_a, info, doswap=.false.) + if (info /= psb_success_) exit + end do + t1 = psb_wtime() + if (info /= psb_success_) goto 9999 + + dt = t1 - t0 + call psb_amx(ctxt, dt) + avg_t = dt / real(times, psb_dpk_) + + if (my_rank == psb_root_) then + write(psb_out_unit,'(/,"SpMV benchmark (no overlap)")') + write(psb_out_unit,'(" idim : ",i0)') idim + write(psb_out_unit,'(" global unknowns : ",i0)') n_global + write(psb_out_unit,'(" repetitions : ",i0)') times + write(psb_out_unit,'(" total time [s] : ",es12.5)') dt + write(psb_out_unit,'(" avg time [s] : ",es12.5)') avg_t + end if + + call psb_gefree(y, desc_a, info) + call psb_gefree(x, desc_a, info) + call psb_spfree(a, desc_a, info) + call psb_cdfree(desc_a, info) + call psb_erractionrestore(err_act) + return + +9999 call psb_error(ctxt) + call psb_error_handler(ctxt, err_act) + end subroutine run_spmv_kernel + +end module psb_spmv_overlap_test + +program psb_spmv_kernel + use psb_spmv_overlap_test + use psb_base_mod +#ifdef PSB_HAVE_CUDA + use psb_cuda_mod +#endif + implicit none + + type(psb_ctxt_type) :: ctxt + logical :: use_gpu + integer(psb_ipk_) :: my_rank, np, k + character(len=256) :: arg + + call psb_init(ctxt) + call psb_info(ctxt, my_rank, np) + +#ifdef PSB_HAVE_CUDA + use_gpu = .true. +#else + use_gpu = .false. +#endif + + do k = 1, command_argument_count() + call get_command_argument(k, arg) + if (index(psb_toupper(trim(arg)), '--GPU=') == 1) then + select case (psb_toupper(adjustl(arg(7:len_trim(arg))))) + case ('TRUE','T','1','YES','Y','ON') + use_gpu = .true. + case ('FALSE','F','0','NO','N','OFF') + use_gpu = .false. + end select + end if + end do + +#ifdef PSB_HAVE_CUDA + if (use_gpu) call psb_cuda_init(ctxt) +#else + use_gpu = .false. +#endif + + if (my_rank == psb_root_) then + write(psb_out_unit,*) 'Welcome to PSBLAS version: ', psb_version_string_ + write(psb_out_unit,*) 'This is the psb_spmv_kernel sample program' + write(psb_out_unit,'("GPU enabled : ",l1)') use_gpu + end if + + call run_spmv_kernel(ctxt,use_gpu) + +#ifdef PSB_HAVE_CUDA + if (use_gpu) call psb_cuda_exit() +#endif + call psb_exit(ctxt) +end program psb_spmv_kernel diff --git a/test/comm/spmv/spmv_overlap.f90 b/test/comm/spmv/spmv_overlap.f90 deleted file mode 100644 index 5d42b4f3..00000000 --- a/test/comm/spmv/spmv_overlap.f90 +++ /dev/null @@ -1,28 +0,0 @@ -program main - use psb_spmv_overlap_test - use psb_base_mod - - implicit none - - integer(psb_ipk_) :: my_rank, np - integer(psb_ipk_) :: k, h - type(psb_ctxt_type) :: ctxt - - - - call psb_init(ctxt) - call psb_info(ctxt, my_rank, np) - - if (my_rank == psb_root_) then - write(psb_out_unit,*) 'Welcome to PSBLAS version: ', psb_version_string_ - write(psb_out_unit,*) 'This is the psb_spmv_overlap_test sample program' - end if - - call psb_barrier(ctxt) - - - call psb_spmv_overlap_kernel(ctxt) - - - call psb_exit(ctxt) -end program main