diff --git a/base/internals/psi_desc_impl.f90 b/base/internals/psi_desc_impl.f90 index db1c385e..59377a08 100644 --- a/base/internals/psi_desc_impl.f90 +++ b/base/internals/psi_desc_impl.f90 @@ -72,12 +72,12 @@ subroutine psi_cnv_v2xch(ictxt, vidx_in, xch_idx,info) ! ....local scalars.... integer(psb_ipk_) :: np, me, img integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: nxch, nsnd, nrcv, nesd,nerv, ip, j, k, ixch + integer(psb_ipk_) :: nxch, nsnd, nrcv, nesd,nerv, ip, j, k, ixch,mxnrcv ! ...parameters integer(psb_ipk_) :: debug_level, debug_unit logical, parameter :: debug=.false. character(len=20) :: name - integer(psb_ipk_), allocatable :: buf_rmt_rcv_bnd(:)[:], buf_rmt_snd_bnd(:)[:] + integer(psb_ipk_), allocatable :: buf_rmt_rcv_bnd(:)[:], buf_rmt_snd_bnd(:)[:], buf_rmt_idx(:)[:] type(event_type), allocatable, save :: snd_done(:)[:] type(event_type), save :: rcv_done[*] name='psi_cnv_v2xch' @@ -100,6 +100,7 @@ subroutine psi_cnv_v2xch(ictxt, vidx_in, xch_idx,info) call psb_get_xch_idx(vidx_in, nxch, nsnd, nrcv) xch_idx%max_buffer_size = max(nsnd,nrcv) call psb_amx(ictxt,xch_idx%max_buffer_size) + mxnrcv = xch_idx%max_buffer_size if (info == 0) call psb_realloc(nxch,xch_idx%prcs_xch,info) if (info == 0) call psb_realloc(nxch,2,xch_idx%rmt_snd_bnd,info) if (info == 0) call psb_realloc(nxch,2,xch_idx%rmt_rcv_bnd,info) @@ -107,6 +108,7 @@ subroutine psi_cnv_v2xch(ictxt, vidx_in, xch_idx,info) if (info == 0) call psb_realloc(nxch+1,xch_idx%loc_rcv_bnd,info) if (info == 0) call psb_realloc(nsnd,xch_idx%loc_snd_idx,info) if (info == 0) call psb_realloc(nrcv,xch_idx%loc_rcv_idx,info) + if (info == 0) call psb_realloc(nrcv,xch_idx%rmt_rcv_idx,info) if (info /= psb_success_) then info = psb_err_from_subroutine_ call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate') @@ -119,10 +121,12 @@ subroutine psi_cnv_v2xch(ictxt, vidx_in, xch_idx,info) ixch = 1 xch_idx%loc_snd_bnd(1) = 1 xch_idx%loc_rcv_bnd(1) = 1 - if (allocated(buf_rmt_rcv_bnd)) deallocate(buf_rmt_rcv_bnd) - if (allocated(buf_rmt_snd_bnd)) deallocate(buf_rmt_snd_bnd) - if (allocated(snd_done)) deallocate(snd_done) - allocate(buf_rmt_rcv_bnd(np*2)[*], buf_rmt_snd_bnd(np*2)[*], snd_done(np)[*]) + if (allocated(buf_rmt_rcv_bnd)) deallocate(buf_rmt_rcv_bnd) + if (allocated(buf_rmt_snd_bnd)) deallocate(buf_rmt_snd_bnd) + if (allocated(snd_done)) deallocate(snd_done) + + allocate(buf_rmt_rcv_bnd(np*2)[*], buf_rmt_snd_bnd(np*2)[*], snd_done(np)[*]) + allocate(buf_rmt_idx(mxnrcv)[*]) do if (ip > size(vidx_in)) then write(psb_err_unit,*) trim(name),': Warning: out of size of input vector ' @@ -142,21 +146,24 @@ subroutine psi_cnv_v2xch(ictxt, vidx_in, xch_idx,info) xch_idx%loc_snd_bnd(ixch+1) = xch_idx%loc_snd_bnd(ixch) + nesd img = xch_idx%prcs_xch(ixch) + 1 !Here I am assuming that all the data exchange between two images takes place in one exchange - buf_rmt_rcv_bnd(me*2 - 1 : me*2)[img]= xch_idx%loc_rcv_bnd(ixch:ixch+1) - buf_rmt_snd_bnd(me*2 - 1 : me*2)[img]= xch_idx%loc_snd_bnd(ixch:ixch+1) - event post(snd_done(me)[img]) - event wait(snd_done(img)) - xch_idx%rmt_rcv_bnd(ixch,1:2)=buf_rmt_rcv_bnd(img*2 - 1 : img*2) - xch_idx%rmt_snd_bnd(ixch,1:2)=buf_rmt_snd_bnd(img*2 - 1 : img*2) + buf_rmt_rcv_bnd(me*2 - 1 : me*2)[img]= xch_idx%loc_rcv_bnd(ixch:ixch+1) + buf_rmt_snd_bnd(me*2 - 1 : me*2)[img]= xch_idx%loc_snd_bnd(ixch:ixch+1) + buf_rmt_idx(1:nesd)[img] = xch_idx%loc_snd_idx(xch_idx%loc_snd_bnd(ixch):xch_idx%loc_snd_bnd(ixch)+nesd-1) + event post(snd_done(me)[img]) + event wait(snd_done(img)) + xch_idx%rmt_rcv_idx(xch_idx%loc_rcv_bnd(ixch):xch_idx%loc_rcv_bnd(ixch)+nerv-1) = & + & buf_rmt_idx(1:nerv) + xch_idx%rmt_rcv_bnd(ixch,1:2)=buf_rmt_rcv_bnd(img*2 - 1 : img*2) + xch_idx%rmt_snd_bnd(ixch,1:2)=buf_rmt_snd_bnd(img*2 - 1 : img*2) ip = ip+nerv+nesd+3 ixch = ixch + 1 end do xch_idx%rmt_rcv_bnd(:,2) = xch_idx%rmt_rcv_bnd(:,2) - 1 xch_idx%rmt_snd_bnd(:,2) = xch_idx%rmt_snd_bnd(:,2) - 1 - if (allocated(buf_rmt_rcv_bnd)) deallocate(buf_rmt_rcv_bnd) - if (allocated(buf_rmt_snd_bnd)) deallocate(buf_rmt_snd_bnd) - if (allocated(snd_done)) deallocate(snd_done) - sync all + if (allocated(buf_rmt_rcv_bnd)) deallocate(buf_rmt_rcv_bnd) + if (allocated(buf_rmt_snd_bnd)) deallocate(buf_rmt_snd_bnd) + if (allocated(snd_done)) deallocate(snd_done) + !sync all call psb_erractionrestore(err_act) return diff --git a/base/internals/psi_extrct_dl.F90 b/base/internals/psi_extrct_dl.F90 index 95076ab3..ac654560 100644 --- a/base/internals/psi_extrct_dl.F90 +++ b/base/internals/psi_extrct_dl.F90 @@ -163,7 +163,7 @@ subroutine psi_extract_dep_list(ictxt,is_bld,is_upd,desc_str,dep_list,& i=1 if (debug_level >= psb_debug_inner_)& & write(debug_unit,*) me,' ',trim(name),': start ',info - +!!$ write(0,*) 'extract_dep_list ',me,npr pointer_dep_list=1 if (is_bld) then do while (desc_str(i) /= -1) @@ -272,7 +272,12 @@ subroutine psi_extract_dep_list(ictxt,is_bld,is_upd,desc_str,dep_list,& itmp(1:dl_lda) = dep_list(1:dl_lda,me) dl_mpi = dl_lda - call caf_allgather(itmp,dl_mpi, dep_list, minfo) + if (.false.) then + call caf_allgather(itmp, dl_mpi, dep_list, minfo) + else + call mpi_allgather(itmp,dl_mpi,psb_mpi_ipk_integer,& + & dep_list,dl_mpi,psb_mpi_ipk_integer,icomm,minfo) + end if if (info == 0) deallocate(itmp,stat=info) if (info /= psb_success_) then info=psb_err_alloc_dealloc_ diff --git a/base/modules/Makefile b/base/modules/Makefile index dad6164a..dac4529b 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -155,7 +155,7 @@ psblas/psb_s_psblas_mod.o psblas/psb_c_psblas_mod.o psblas/psb_d_psblas_mod.o ps psb_base_mod.o: $(MODULES) -psb_caf_mod.o: psb_caf_mod.f90 psb_const_mod.o +psb_caf_mod.o: psb_caf_mod.f90 psb_const_mod.o psb_realloc_mod.o $(FC) $(FINCLUDES) $(FDEFINES) $(F90COPT) $(EXTRA_OPT) -c $< -o $@ psi_penv_mod.o: psi_penv_mod.F90 $(BASIC_MODS) diff --git a/base/modules/desc/psb_desc_mod.F90 b/base/modules/desc/psb_desc_mod.F90 index 38bd7e5f..acb39d0d 100644 --- a/base/modules/desc/psb_desc_mod.F90 +++ b/base/modules/desc/psb_desc_mod.F90 @@ -204,6 +204,7 @@ module psb_desc_mod integer(psb_ipk_), allocatable :: rmt_rcv_bnd(:,:) integer(psb_ipk_), allocatable :: loc_rcv_bnd(:) integer(psb_ipk_), allocatable :: loc_snd_bnd(:) + integer(psb_ipk_), allocatable :: rmt_rcv_idx(:) integer(psb_ipk_), allocatable :: loc_rcv_idx(:) integer(psb_ipk_), allocatable :: loc_snd_idx(:) integer(psb_ipk_) :: max_buffer_size=0 diff --git a/base/modules/psb_caf_mod.f90 b/base/modules/psb_caf_mod.f90 index 17ce2d2a..f7100575 100644 --- a/base/modules/psb_caf_mod.f90 +++ b/base/modules/psb_caf_mod.f90 @@ -1131,6 +1131,7 @@ t2 = mpi_wtime() - t1 end subroutine caf_zgatherv subroutine caf_iallgatherv(snd, scount, rcv, rcount, rdispls, info) + use psb_realloc_mod implicit none integer(psb_ipk_), intent(in) :: scount, snd(:), rcount(:), rdispls(:) integer(psb_ipk_), allocatable, intent(inout) :: rcv(:) @@ -1149,16 +1150,22 @@ t2 = mpi_wtime() - t1 info = -4 return endif - +! write(0,*) 'caf_allgatherv',me,np +! call flush(0) +! sync all if (allocated(snd_buf)) deallocate(snd_buf) allocate(snd_buf(scount)[*], STAT=info) - if (info/=0) return + if (info/=0)then + write(0,*) me, 'Info on allocate ',info + return + end if if (allocated(rcv).and.(size(rcv,1) < rdispls(np) + rcount(np))) then - call move_alloc(rcv,rcv_tmp) - allocate(rcv(rdispls(np)+rcount(np))) - rcv(1:size(rcv_tmp,1))=rcv_tmp - deallocate(rcv_tmp) + call psb_realloc(rdispls(np)+rcount(np),rcv,info) +!!$ call move_alloc(rcv,rcv_tmp) +!!$ allocate(rcv(rdispls(np)+rcount(np))) +!!$ rcv(1:size(rcv_tmp,1))=rcv_tmp +!!$ deallocate(rcv_tmp) endif snd_buf(1:scount)=snd(1:scount) @@ -1365,39 +1372,75 @@ t2 = mpi_wtime() - t1 integer(psb_ipk_), allocatable :: snd_buf(:)[:] type(event_type), allocatable :: snd_copied(:)[:] double precision :: t1, t2 - t1 = mpi_wtime() - np = num_images() - me = this_image() - info = 0 - if (size(rcv,1) < scount) then - info = -3 - print*,'error', info, size(rcv,1), scount - return - endif - if (size(rcv,2) < np) then - info = -4 - print*,'error', info, size(rcv,2), np - return - endif - if (allocated(snd_buf)) deallocate(snd_buf) - if (allocated(snd_copied)) deallocate(snd_copied) - allocate(snd_buf(size(snd,1))[*]) - allocate(snd_copied(np)[*]) - !allocate(snd_buf(size(snd,1))[*]) - snd_buf=snd - do img=1,np - event post(snd_copied(me)[img]) - enddo - !sync all - do img=1,np - event wait(snd_copied(img)) - rcv(:,img)=snd_buf(:)[img] - enddo - if (allocated(snd_buf)) deallocate(snd_buf) - if (allocated(snd_copied)) deallocate(snd_copied) - !Not sure this is necessary... - sync all - t2 = mpi_wtime() - t1 + t1 = mpi_wtime() + np = num_images() + me = this_image() + info = 0 + if (size(rcv,1) < scount) then + info = -3 + print*,'error', info, size(rcv,1), scount + return + endif + if (size(rcv,2) < np) then + info = -4 + print*,'error', info, size(rcv,2), np + return + endif +! write(*,*) 'Hello from ',me,' of:', np,size(snd,1) +! sync all + if (.true.) then +!!$ if (allocated(snd_buf)) deallocate(snd_buf) +!!$ if (allocated(snd_copied)) deallocate(snd_copied) + allocate(snd_buf(size(snd,1))[*],stat=info) + if (info /= 0) then + write(*,*) 'Error on allocating snd_buf ',me + stop + end if + allocate(snd_copied(np)[*],stat=info) + if (info /= 0) then + write(*,*) 'Error on allocating snd_copied ',me + stop + end if + !allocate(snd_buf(size(snd,1))[*]) + snd_buf(:)=snd(:) +! write(*,*) 'Sending from ',me,':', snd(:) + do img=1,np + event post(snd_copied(me)[img]) + enddo + !sync all + if(.false.) then + do img=1,np + event wait(snd_copied(img)) + rcv(:,img)=snd_buf(:)[img] + enddo + else + do img=me+1,np + event wait(snd_copied(img)) + rcv(:,img)=snd_buf(:)[img] + enddo + do img=1,me + event wait(snd_copied(img)) + rcv(:,img)=snd_buf(:)[img] + enddo + + end if + !if (allocated(snd_buf)) + deallocate(snd_buf) + !if (allocated(snd_copied)) + deallocate(snd_copied) + !Not sure this is necessary... + !sync all + else if(.false.) then + do img=1,np + if (me == img) rcv(:,img) = snd(:) + call co_broadcast(rcv(:,img),img) + end do + else if(.false.) then + rcv(:,:) = 0 + rcv(:,me) = snd(:) + call co_sum(rcv) + end if + t2 = mpi_wtime() - t1 end subroutine caf_allgather diff --git a/include/Make.inc.psblas b/include/Make.inc.psblas index 61a1e60b..2287943c 100644 --- a/include/Make.inc.psblas +++ b/include/Make.inc.psblas @@ -28,24 +28,24 @@ FLINK=$(MPFC) LIBS= # BLAS, BLACS and METIS libraries. -BLAS=-lcblas -lf77blas -latlas -L/opt/atlas/3.8.4/gnu/7.1.0/lib -METIS_LIB=-lmetis -L/opt/parmetis/4.0.3/mpich/3.2.0/gnu/7.1.0/Lib -L/opt/parmetis/4.0.3/mpich/3.2.0/gnu/7.1.0/lib +BLAS=-lopenblas -L/mnt/gpfs0/home/e802756/builds/openblas/0.2.20 +METIS_LIB= AMD_LIB= -LAPACK=-llapack +LAPACK= EXTRA_COBJS= -PSBFDEFINES=-DHAVE_METIS -DHAVE_LAPACK -DHAVE_MOLD -DHAVE_EXTENDS_TYPE_OF -DHAVE_SAME_TYPE_AS -DHAVE_FINAL -DHAVE_ISO_FORTRAN_ENV -DHAVE_FLUSH_STMT -DHAVE_VOLATILE -DMPI_MOD -PSBCDEFINES=-DHAVE_METIS_ -I/opt/parmetis/4.0.3/mpich/3.2.0/gnu/7.1.0/include -I/opt/parmetis/4.0.3/mpich/3.2.0/gnu/7.1.0/Include -DLowerUnderscore -DPtr64Bits +PSBFDEFINES=-DHAVE_LAPACK -DHAVE_MOLD -DHAVE_EXTENDS_TYPE_OF -DHAVE_SAME_TYPE_AS -DHAVE_FINAL -DHAVE_ISO_FORTRAN_ENV -DHAVE_FLUSH_STMT -DHAVE_VOLATILE -DMPI_MOD +PSBCDEFINES=-DLowerUnderscore -DPtr64Bits AR=ar -cur RANLIB=ranlib INSTALL=/usr/bin/install -c INSTALL_DATA=${INSTALL} -m 644 -INSTALL_DIR=/opt/psblas/CAF/7.1.0-csr -INSTALL_LIBDIR=/opt/psblas/CAF/7.1.0-csr/lib -INSTALL_INCLUDEDIR=/opt/psblas/CAF/7.1.0-csr/include -INSTALL_DOCSDIR=/opt/psblas/CAF/7.1.0-csr/docs -INSTALL_SAMPLESDIR=/opt/psblas/CAF/7.1.0-csr/samples +INSTALL_DIR=/mnt/gpfs0/home/e802756/NUMERICAL/CAF/7.1.0-ompi1107 +INSTALL_LIBDIR=/mnt/gpfs0/home/e802756/NUMERICAL/CAF/7.1.0-ompi1107/lib +INSTALL_INCLUDEDIR=/mnt/gpfs0/home/e802756/NUMERICAL/CAF/7.1.0-ompi1107/include +INSTALL_DOCSDIR=/mnt/gpfs0/home/e802756/NUMERICAL/CAF/7.1.0-ompi1107/docs +INSTALL_SAMPLESDIR=/mnt/gpfs0/home/e802756/NUMERICAL/CAF/7.1.0-ompi1107/samples # the following is the flag for /bin/cp which shall copy the file only for updating (timestamp based)--on GNU Linux, '-u' CPUPDFLAG= diff --git a/test/hello/Makefile b/test/hello/Makefile index 81c337bb..12d85ef5 100644 --- a/test/hello/Makefile +++ b/test/hello/Makefile @@ -28,8 +28,7 @@ pingpong: pingpong.o clean: - /bin/rm -f hello.o pingpong.o - $(EXEDIR)/hello + /bin/rm -f hello.o pingpong.o $(EXEDIR)/hello verycleanlib: (cd ../..; make veryclean) lib: diff --git a/test/hello/hello.f90 b/test/hello/hello.f90 index f84de007..2b7ba8f1 100644 --- a/test/hello/hello.f90 +++ b/test/hello/hello.f90 @@ -1,21 +1,22 @@ program hello - use psb_base_mod + use iso_fortran_env implicit none - integer iam, np, icontxt, ip, jp, idummy + integer me, np, icontxt, ip, jp, idummy + integer :: snd_buf(4)[*] + type(event_type), allocatable :: snd_copied(:)[:] - call psb_init(icontxt) - call psb_info(icontxt,iam,np) - ! have all processes check in - if ((iam >= 0).and.(iam < np)) then - if (iam == 0) then - do ip = 1, np-1 - call psb_rcv(icontxt,idummy,ip) - enddo - write(*,*) 'Hello, world: all ',np, & - & ' processes checked in!' - else - call psb_snd(icontxt,idummy,0) - endif + + me = this_image() + np = num_images() + + write(*,*) 'Hello from ',me,' of:', np + snd_buf(1:4) = me*(/1,2,3,4/) + + sync all + if (me == 1) then + do ip=1,np + write(*,*) 'From ',ip,' :',snd_buf(:)[ip] + end do end if - call psb_exit(icontxt) + end program hello diff --git a/test/kernel/pdgenspmv.f90 b/test/kernel/pdgenspmv.f90 index 9d72fd91..98e4815b 100644 --- a/test/kernel/pdgenspmv.f90 +++ b/test/kernel/pdgenspmv.f90 @@ -47,20 +47,24 @@ program pdgenspmv ! sparse matrix and preconditioner type(psb_dspmat_type) :: a, ad, ah, ah1 + type(psb_d_csr_sparse_mat) :: acsr ! descriptor type(psb_desc_type) :: desc_a ! dense matrices - type(psb_d_vect_type) :: xv,bv, vtst - real(psb_dpk_), allocatable :: tst(:) + type(psb_d_vect_type) :: xv,bv, vtst,bvh + real(psb_dpk_), allocatable :: tst(:), work(:) + real(psb_dpk_), allocatable :: xvc(:)[:], bvc(:)[:] ! blacs parameters integer(psb_ipk_) :: ictxt, iam, np - type(psb_d_csre_sparse_mat) :: acsre + ! type(psb_d_csre_sparse_mat) :: acsre ! solver parameters - integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nr,nrl,ncl + integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nr,nrl,ncl, lwork, nclmx + integer(psb_ipk_) :: ip, img, nxch, p1,p2 integer(psb_long_int_k_) :: amatsize, precsize, descsize, d2size, annz, nbytes, ahnnz real(psb_dpk_) :: err, eps - integer(psb_ipk_), parameter :: times=50 + integer(psb_ipk_) :: times + integer(psb_ipk_), parameter :: iwarm=10 ! other variables integer(psb_ipk_) :: info, i @@ -91,7 +95,7 @@ program pdgenspmv ! ! get parameters ! - call get_parms(ictxt,afmt,idim) + call get_parms(ictxt,afmt,idim, times) ! ! allocate and fill in the coefficient matrix, rhs and initial guess @@ -108,52 +112,87 @@ program pdgenspmv call psb_errpush(info,name,a_err=ch_err) goto 9999 end if + call psb_geasb(bvh,desc_a,info,scratch=.true.) if (iam == psb_root_) write(psb_out_unit,'("Overall matrix creation time : ",es12.5)')t2 if (iam == psb_root_) write(psb_out_unit,'(" ")') !!$ write(fname,'(a,i3.3,a,i3.3,a,i3.3,a)') 'testmat-',idim,'-',np,'-',iam,'.mtx' !!$ call a%print(fname,head="psb-testing") - - call xv%set(done) - call psb_barrier(ictxt) - t1 = psb_wtime() - do i=1,times - call psb_spmm(done,a,xv,dzero,bv,desc_a,info,'n') - end do - call psb_barrier(ictxt) - t2 = psb_wtime() - t1 - call psb_amx(ictxt,t2) + ! + ! Coarrays for experiment + ! nrl = desc_a%get_local_rows() ncl = desc_a%get_local_cols() - call a%csclip(ad,info,jmax=nrl) - call a%csclip(ah,info,jmin=nrl+1,jmax=ncl,cscale=.true.) - call a%csclip(ah1,info,jmin=nrl+1,jmax=ncl,cscale=.true.) - call ah%cscnv(info,mold=acsre) - call ah1%cscnv(info,mold=acsre) - + nclmx = ncl + call psb_amx(ictxt,nclmx) + allocate(xvc(nclmx)[*]) + allocate(bvc(nclmx)[*]) + xvc(1:ncl) = done *(/(i,i=1,ncl)/) + bvc(:) = dzero + call xv%set(xvc(1:ncl)) + + + do i=1,iwarm + call psb_spmm(done,a,xv,dzero,bv,desc_a,info,'n') + end do + ! FIXME: cache flush needed here - call psb_barrier(ictxt) - tt1 = psb_wtime() - do i=1,times - call psb_csmm(done,ah,xv,done,bv,info) + do i=1,iwarm + call psb_spmm(done,a,xv,dzero,bv,desc_a,info,'n') end do call psb_barrier(ictxt) - th = psb_wtime() - tt1 - call psb_barrier(ictxt) tt1 = psb_wtime() do i=1,times - call psb_csmm(done,ah1,xv,done,bv,info) + call psb_spmm(done,a,xv,dzero,bv,desc_a,info,'n') end do call psb_barrier(ictxt) - th1 = psb_wtime() - tt1 - call psb_amx(ictxt,th1) + th = psb_wtime() - tt1 + call psb_amx(ictxt,th) + + if (.true.) then + associate(xchg => desc_a%halo_xch) + ! FIXME: cache flush needed here + nxch = size(xchg%prcs_xch) + write(0,*) this_image(),nxch,nrl,ncl,' Exchanging with ',xchg%prcs_xch+1 + do i=1,iwarm + ! Sync images + sync images(xchg%prcs_xch+1) + do ip = 1, nxch + img = xchg%prcs_xch(ip) + 1 + p1 = xchg%loc_rcv_bnd(ip) + p2 = xchg%loc_rcv_bnd(ip+1)-1 + write(0,*) this_image(),'Boundaries ',p1,p2,' :',xchg%loc_rcv_idx(p1:p2),':',xchg%rmt_rcv_idx(p1:p2) + xvc(xchg%loc_rcv_idx(p1:p2)) = xvc(xchg%rmt_rcv_idx(p1:p2))[img] + end do + call a%csmv(done,xvc,dzero,bvc,info) + end do + call psb_barrier(ictxt) + tt1 = psb_wtime() + do i=1,times + ! Sync images + sync images(xchg%prcs_xch+1) + do ip = 1, nxch + img = xchg%prcs_xch(ip) + 1 + p1 = xchg%loc_rcv_bnd(ip) + p2 = xchg%loc_rcv_bnd(ip+1)-1 + !xvc(xchg%loc_rcv_idx(p1:p2)) = xvc(xchg%rmt_rcv_idx(p1:p2))[img] + end do + call a%csmv(done,xvc,dzero,bvc,info) + end do + call psb_barrier(ictxt) + tt2 = psb_wtime() - tt1 + call psb_amx(ictxt,tt2) + end associate + call bvh%set(bvc(1:ncl)) + endif + + call psb_geaxpby(-done,bv,done,bvh,desc_a,info) + err = psb_genrm2(bvh,desc_a,info) - call psb_amx(ictxt,t2) nr = desc_a%get_global_rows() annz = a%get_nzeros() - ahnnz = ah%get_nzeros() amatsize = a%sizeof() descsize = psb_sizeof(desc_a) call psb_sum(ictxt,annz) @@ -163,39 +202,36 @@ program pdgenspmv if (iam == psb_root_) then flops = 2.d0*times*annz - tflops = 2.d0*times*ahnnz - tflops1 = 2.d0*times*ahnnz + tflops = flops + write(psb_out_unit,'("Matrix: ell1 ",i0)') idim write(psb_out_unit,'("Test on : ",i20," processors")') np write(psb_out_unit,'("Size of matrix : ",i20," ")') nr write(psb_out_unit,'("Number of nonzeros : ",i20," ")') annz - write(psb_out_unit,'("Number of nonzeros : ",i20," ")') ahnnz write(psb_out_unit,'("Memory occupation : ",i20," ")') amatsize write(psb_out_unit,'("Number of flops (",i0," prod) : ",F20.0," ")') times,flops - flops = flops / (t2) - tflops = tflops / (th) - tflops1 = tflops1 / (th1) - write(psb_out_unit,'("Time for ",i0," products (s) : ",F20.3)')times, t2 - write(psb_out_unit,'("Time per product (ms) : ",F20.3)') t2*1.d3/(1.d0*times) + flops = flops / (th) + tflops = tflops / (tt2) + + write(psb_out_unit,'("Time for ",i0," products (s) : ",F20.3)')times, th + write(psb_out_unit,'("Time per product (ms) : ",F20.3)') th*1.d3/(1.d0*times) write(psb_out_unit,'("MFLOPS : ",F20.3)') flops/1.d6 - write(psb_out_unit,'("Time for ",i0," products (s) (trans.): ",F20.3)') times,th - write(psb_out_unit,'("Time per product (ms) (trans.): ",F20.3)') th*1.d3/(1.d0*times) - write(psb_out_unit,'("MFLOPS (trans.): ",F20.3)') tflops/1.d6 - write(psb_out_unit,'("Time for ",i0," products (s) (trans.): ",F20.3)') times,th1 - write(psb_out_unit,'("Time per product (ms) (trans.): ",F20.3)') th1*1.d3/(1.d0*times) - write(psb_out_unit,'("MFLOPS (trans.): ",F20.3)') tflops1/1.d6 + write(psb_out_unit,'("Time for ",i0," products (s) nw : ",F20.3)')times, tt2 + write(psb_out_unit,'("Time per product (ms) nw : ",F20.3)') tt2*1.d3/(1.d0*times) + write(psb_out_unit,'("MFLOPS : ",F20.3)') tflops/1.d6 + + write(psb_out_unit,'("Difference : ",E20.12)') err ! ! This computation is valid for CSR ! nbytes = nr*(2*psb_sizeof_dp + psb_sizeof_int)+& & annz*(psb_sizeof_dp + psb_sizeof_int) - bdwdth = times*nbytes/(t2*1.d6) + bdwdth = times*nbytes/(th*1.d6) write(psb_out_unit,*) write(psb_out_unit,'("MBYTES/S : ",F20.3)') bdwdth - bdwdth = times*nbytes/(tt2*1.d6) - write(psb_out_unit,'("MBYTES/S (trans): ",F20.3)') bdwdth + write(psb_out_unit,'("Storage type for DESC_A: ",a)') desc_a%get_fmt() write(psb_out_unit,'("Total memory occupation for DESC_A: ",i12)')descsize @@ -227,10 +263,10 @@ contains ! ! get iteration parameters from standard input ! - subroutine get_parms(ictxt,afmt,idim) + subroutine get_parms(ictxt,afmt,idim, times) integer(psb_ipk_) :: ictxt character(len=*) :: afmt - integer(psb_ipk_) :: idim + integer(psb_ipk_) :: idim, times integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: intbuf(10), ip @@ -239,9 +275,11 @@ contains if (iam == 0) then read(psb_inp_unit,*) afmt read(psb_inp_unit,*) idim + read(psb_inp_unit,*) times endif call psb_bcast(ictxt,afmt) call psb_bcast(ictxt,idim) + call psb_bcast(ictxt,times) if (iam == 0) then write(psb_out_unit,'("Testing matrix : ell1")') diff --git a/test/kernel/runs/spmv.inp b/test/kernel/runs/spmv.inp index 7539285f..3e4c5f79 100644 --- a/test/kernel/runs/spmv.inp +++ b/test/kernel/runs/spmv.inp @@ -1,3 +1,3 @@ CSR -120 - +008 +80 diff --git a/test/pargen/psb_d_pde2d.f90 b/test/pargen/psb_d_pde2d.f90 index c1477c66..f7ea328c 100644 --- a/test/pargen/psb_d_pde2d.f90 +++ b/test/pargen/psb_d_pde2d.f90 @@ -236,6 +236,7 @@ program psb_d_pde2d if (iam == psb_root_) then write(psb_out_unit,'(" ")') + write(psb_out_unit,'("Number of processors : ",i0)') np write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter write(psb_out_unit,'("Number of iterations : ",i0)')iter diff --git a/test/pargen/psb_d_pde3d.f90 b/test/pargen/psb_d_pde3d.f90 index b3700f38..69391695 100644 --- a/test/pargen/psb_d_pde3d.f90 +++ b/test/pargen/psb_d_pde3d.f90 @@ -131,7 +131,7 @@ program psb_d_pde3d ! miscellaneous real(psb_dpk_), parameter :: one = 1.d0 - real(psb_dpk_) :: t1, t2, tprec + real(psb_dpk_) :: t1, t2, tprec, tslv, tprcavg, tslvavg ! sparse matrix and preconditioner type(psb_dspmat_type) :: a @@ -152,7 +152,8 @@ program psb_d_pde3d integer(psb_ipk_) :: info, i character(len=20) :: name,ch_err character(len=40) :: fname - + integer(psb_ipk_) :: itest, ntests + integer, parameter :: ntdef=10, nwrmup=4 info=psb_success_ @@ -166,7 +167,7 @@ program psb_d_pde3d endif if(psb_get_errstatus() /= 0) goto 9999 name='pde3d90' - call psb_set_errverbosity(itwo) + call psb_set_errverbosity(itwo*500) call psb_cd_set_large_threshold(itwo) ! ! Hello world @@ -178,7 +179,7 @@ program psb_d_pde3d ! ! get parameters ! - call get_parms(ictxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst) + call get_parms(ictxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ntests) ! ! allocate and fill in the coefficient matrix, rhs and initial guess @@ -201,45 +202,70 @@ program psb_d_pde3d ! prepare the preconditioner. ! if(iam == psb_root_) write(psb_out_unit,'("Setting preconditioner to : ",a)')ptype - call prec%init(ptype,info) + tslv = 1d300 + tprec = 1d300 + tprcavg = 0 + tslvavg = 0 + do itest=1, ntests+nwrmup + + call prec%init(ptype,info) + + call psb_barrier(ictxt) + t1 = psb_wtime() + call prec%build(a,desc_a,info) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='psb_precbld' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if - call psb_barrier(ictxt) - t1 = psb_wtime() + t2 = psb_wtime()-t1 + call psb_amx(ictxt,t2) + if (itest>nwrmup) then + tprec = min(tprec,t2) + tprcavg = tprcavg + t2 + end if + call prec%free(info) + end do + + if (iam == psb_root_) write(psb_out_unit,'(" ")') + call prec%init(ptype,info) call prec%build(a,desc_a,info) - if(info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='psb_precbld' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if + call prec%descr() - tprec = psb_wtime()-t1 - call psb_amx(ictxt,tprec) + do itest=1, ntests+nwrmup + + ! + ! iterative method parameters + ! + call xxv%zero() + if(iam == psb_root_) write(psb_out_unit,'("Calling iterative method ",a)')kmethd + call psb_barrier(ictxt) + t1 = psb_wtime() + eps = 1.d-9 + call psb_krylov(kmethd,a,prec,bv,xxv,eps,desc_a,info,& + & itmax=itmax,iter=iter,err=err,itrace=itrace,istop=istopc,irst=irst) + + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='solver routine' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if - if (iam == psb_root_) write(psb_out_unit,'("Preconditioner time : ",es12.5)')tprec - if (iam == psb_root_) write(psb_out_unit,'(" ")') - call prec%descr() - ! - ! iterative method parameters - ! - if(iam == psb_root_) write(psb_out_unit,'("Calling iterative method ",a)')kmethd - call psb_barrier(ictxt) - t1 = psb_wtime() - eps = 1.d-9 - call psb_krylov(kmethd,a,prec,bv,xxv,eps,desc_a,info,& - & itmax=itmax,iter=iter,err=err,itrace=itrace,istop=istopc,irst=irst) + call psb_barrier(ictxt) + t2 = psb_wtime() - t1 + call psb_amx(ictxt,t2) + if (itest>nwrmup) then + tslv = min(tslv,t2) + tslvavg = tslvavg + t2 + end if + + end do - if(info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='solver routine' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - call psb_barrier(ictxt) - t2 = psb_wtime() - t1 - call psb_amx(ictxt,t2) amatsize = a%sizeof() descsize = desc_a%sizeof() precsize = prec%sizeof() @@ -249,8 +275,12 @@ program psb_d_pde3d if (iam == psb_root_) then write(psb_out_unit,'(" ")') - write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 - write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter + write(psb_out_unit,'("Number of processors : ",i0)') np + write(psb_out_unit,'("Preconditioner time min : ",es12.5)')tprec + write(psb_out_unit,'("Preconditioner time avg : ",es12.5)')tprcavg + write(psb_out_unit,'("Time to solve system min : ",es12.5)')tslv + write(psb_out_unit,'("Time to solve system avg : ",es12.5)')tslvavg + write(psb_out_unit,'("Time per iteration avg : ",es12.5)')tslvavg/iter write(psb_out_unit,'("Number of iterations : ",i0)')iter write(psb_out_unit,'("Convergence indicator on exit : ",es12.5)')err write(psb_out_unit,'("Info on exit : ",i0)')info @@ -267,7 +297,6 @@ program psb_d_pde3d call psb_gefree(bv,desc_a,info) call psb_gefree(xxv,desc_a,info) call psb_spfree(a,desc_a,info) - call prec%free(info) call psb_cdfree(desc_a,info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -276,7 +305,7 @@ program psb_d_pde3d goto 9999 end if - call psb_exit(ictxt) + call psb_exit(ictxt,close=.false.) stop 9999 call psb_error(ictxt) @@ -287,10 +316,10 @@ contains ! ! get iteration parameters from standard input ! - subroutine get_parms(ictxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst) + subroutine get_parms(ictxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ntests) integer(psb_ipk_) :: ictxt character(len=*) :: kmethd, ptype, afmt - integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst + integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst, ntests integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: ip @@ -325,6 +354,11 @@ contains else irst=1 endif + if (ip >= 8) then + read(psb_inp_unit,*) ntests + else + ntests = ntdef + endif ! broadcast parameters to all processors @@ -353,6 +387,7 @@ contains call psb_bcast(ictxt,itmax) call psb_bcast(ictxt,itrace) call psb_bcast(ictxt,irst) + call psb_bcast(ictxt,ntests) return diff --git a/test/pargen/psb_s_pde2d.f90 b/test/pargen/psb_s_pde2d.f90 index 5b81f114..9fde6ce9 100644 --- a/test/pargen/psb_s_pde2d.f90 +++ b/test/pargen/psb_s_pde2d.f90 @@ -235,6 +235,7 @@ program psb_s_pde2d if (iam == psb_root_) then write(psb_out_unit,'(" ")') + write(psb_out_unit,'("Number of processors : ",i0)') np write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter write(psb_out_unit,'("Number of iterations : ",i0)')iter diff --git a/test/pargen/psb_s_pde3d.f90 b/test/pargen/psb_s_pde3d.f90 index f769fa73..dec11be2 100644 --- a/test/pargen/psb_s_pde3d.f90 +++ b/test/pargen/psb_s_pde3d.f90 @@ -249,6 +249,7 @@ program psb_s_pde3d if (iam == psb_root_) then write(psb_out_unit,'(" ")') + write(psb_out_unit,'("Number of processors : ",i0)') np write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter write(psb_out_unit,'("Number of iterations : ",i0)')iter diff --git a/test/pargen/runs/ppde.inp b/test/pargen/runs/ppde.inp index e88fd748..ae24cfca 100644 --- a/test/pargen/runs/ppde.inp +++ b/test/pargen/runs/ppde.inp @@ -1,11 +1,11 @@ -7 Number of entries below this +8 Number of entries below this BICGSTAB Iterative method BICGSTAB CGS BICG BICGSTABL RGMRES BJAC Preconditioner NONE DIAG BJAC CSR Storage format for matrix A: CSR COO JAD -1000 Domain size (acutal system is this**3) +200 Domain size (acutal system is this**3) 2 Stopping criterion 1000 MAXIT -1 ITRACE 002 IRST restart for RGMRES and BiCGSTABL - +10 NTESTS