From 9c248a31e2a74248cbab7028f00b16ffcdf34b75 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 19 Apr 2023 12:12:58 +0200 Subject: [PATCH] Refactored EXSCAN and its OpenMP usage. --- base/serial/impl/psb_c_csr_impl.F90 | 20 ++--- base/serial/impl/psb_d_csr_impl.F90 | 20 ++--- base/serial/impl/psb_s_csr_impl.F90 | 20 ++--- base/serial/impl/psb_z_csr_impl.F90 | 20 ++--- base/serial/psi_c_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_d_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_e_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_i2_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_m_serial_impl.F90 | 115 +++++++++++++++------------- base/serial/psi_s_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_z_serial_impl.F90 | 113 +++++++++++++++------------ 11 files changed, 473 insertions(+), 400 deletions(-) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 388e9e2b..b742204d 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=cone,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_c_mv_csr_from_coo diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 1762604d..86287e32 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=done,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_d_mv_csr_from_coo diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 71da21a4..46ead8fc 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=sone,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_s_mv_csr_from_coo diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 34332f46..b6ec8fe7 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=zone,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_z_mv_csr_from_coo diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index 91f463f4..2391becb 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase complex(psb_spk_) :: shift_, tp, ts - complex(psb_spk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_c_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_c_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_c_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + complex(psb_spk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_c_exscan +#endif end subroutine psi_c_exscanv subroutine psb_m_cgelp(trans,iperm,x,info) diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index d8fb92d7..099dd1d4 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase real(psb_dpk_) :: shift_, tp, ts - real(psb_dpk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_d_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_d_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_d_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + real(psb_dpk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_d_exscan +#endif end subroutine psi_d_exscanv subroutine psb_m_dgelp(trans,iperm,x,info) diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index 24ce41f1..10ca93b4 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase integer(psb_epk_) :: shift_, tp, ts - integer(psb_epk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_e_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_e_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_e_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + integer(psb_epk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_e_exscan +#endif end subroutine psi_e_exscanv subroutine psb_m_egelp(trans,iperm,x,info) diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index c5889013..3ccd35bc 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase integer(psb_i2pk_) :: shift_, tp, ts - integer(psb_i2pk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_i2_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_i2_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_i2_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + integer(psb_i2pk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_i2_exscan +#endif end subroutine psi_i2_exscanv subroutine psb_m_i2gelp(trans,iperm,x,info) diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index d7849cd7..c3d73e5a 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -44,9 +44,8 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase integer(psb_mpk_) :: shift_, tp, ts - integer(psb_mpk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ - logical is_nested + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -60,56 +59,14 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - is_nested = omp_get_nested() - call omp_set_nested(.true.) - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_m_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + !$OMP PARALLEL default(shared) + call inner_m_exscan() + !$OMP END PARALLEL end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - !$OMP END PARALLEL - call omp_set_nested(is_nested) #else tp = shift_ do i=1,n @@ -119,7 +76,61 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_m_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + integer(psb_mpk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_m_exscan +#endif end subroutine psi_m_exscanv subroutine psb_m_mgelp(trans,iperm,x,info) diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index ec22ade7..fd1c4cd4 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase real(psb_spk_) :: shift_, tp, ts - real(psb_spk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_s_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_s_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_s_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + real(psb_spk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_s_exscan +#endif end subroutine psi_s_exscanv subroutine psb_m_sgelp(trans,iperm,x,info) diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index 9bcdfd7e..bf055476 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase complex(psb_dpk_) :: shift_, tp, ts - complex(psb_dpk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_z_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_z_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_z_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + complex(psb_dpk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_z_exscan +#endif end subroutine psi_z_exscanv subroutine psb_m_zgelp(trans,iperm,x,info)