diff --git a/samples/advanced/pdegen/amg_d_genpde_mod.F90 b/samples/advanced/pdegen/amg_d_genpde_mod.F90 index d6acd01c..ec3affc5 100644 --- a/samples/advanced/pdegen/amg_d_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_d_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_d_csc_sparse_mat) :: acsc type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr - real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(done,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(done,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info) diff --git a/samples/advanced/pdegen/amg_s_genpde_mod.F90 b/samples/advanced/pdegen/amg_s_genpde_mod.F90 index 7d32cf30..dfa79ab3 100644 --- a/samples/advanced/pdegen/amg_s_genpde_mod.F90 +++ b/samples/advanced/pdegen/amg_s_genpde_mod.F90 @@ -93,6 +93,9 @@ contains & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -128,7 +131,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -141,8 +143,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -368,119 +369,128 @@ contains call psb_barrier(ctxt) talc = psb_wtime()-t0 - if (info /= psb_success_) then - info=psb_err_from_subroutine_ - ch_err='allocation rout.' - call psb_errpush(info,name,a_err=ch_err) - goto 9999 - end if - - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if (info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + !write(0,*) ' Outer in_parallel ',omp_in_parallel() + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -490,7 +500,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) call psb_barrier(ctxt) t1 = psb_wtime() @@ -557,6 +566,9 @@ contains & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -591,7 +603,6 @@ contains type(psb_s_csc_sparse_mat) :: acsc type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr - real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner @@ -604,8 +615,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2, dd @@ -791,7 +801,7 @@ contains !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() end if end block - + case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -816,93 +826,109 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + ! loop over rows belonging to current process in a block + ! distribution. + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do + !$omp end do + + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -912,8 +938,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info)