Refactored EXSCAN and its OpenMP usage.

omp-threadsafe
sfilippone 2 years ago
parent 02dd204351
commit 9c248a31e2

@ -2920,16 +2920,14 @@ subroutine psb_c_cp_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -2937,6 +2935,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -2944,8 +2943,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
call psi_exscan(nr+1,a%irp,info,shift=cone,ibase=ione)
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
call a%set_host()
@ -3089,16 +3088,14 @@ subroutine psb_c_mv_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -3106,6 +3103,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -3113,11 +3111,9 @@ subroutine psb_c_mv_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
#endif
!write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1))
!write(0,*) name,a%irp(:)
call a%set_host()
end subroutine psb_c_mv_csr_from_coo

@ -2920,16 +2920,14 @@ subroutine psb_d_cp_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -2937,6 +2935,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -2944,8 +2943,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
call psi_exscan(nr+1,a%irp,info,shift=done,ibase=ione)
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
call a%set_host()
@ -3089,16 +3088,14 @@ subroutine psb_d_mv_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -3106,6 +3103,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -3113,11 +3111,9 @@ subroutine psb_d_mv_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
#endif
!write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1))
!write(0,*) name,a%irp(:)
call a%set_host()
end subroutine psb_d_mv_csr_from_coo

@ -2920,16 +2920,14 @@ subroutine psb_s_cp_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -2937,6 +2935,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -2944,8 +2943,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
call psi_exscan(nr+1,a%irp,info,shift=sone,ibase=ione)
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
call a%set_host()
@ -3089,16 +3088,14 @@ subroutine psb_s_mv_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -3106,6 +3103,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -3113,11 +3111,9 @@ subroutine psb_s_mv_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
#endif
!write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1))
!write(0,*) name,a%irp(:)
call a%set_host()
end subroutine psb_s_mv_csr_from_coo

@ -2920,16 +2920,14 @@ subroutine psb_z_cp_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -2937,6 +2935,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -2944,8 +2943,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
call psi_exscan(nr+1,a%irp,info,shift=zone,ibase=ione)
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
call a%set_host()
@ -3089,16 +3088,14 @@ subroutine psb_z_mv_csr_from_coo(a,b,info)
#if defined(OPENMP)
!$OMP PARALLEL default(none) &
!$OMP shared(nr,a,itemp,nza) &
!$OMP private(i,info)
!$OMP PARALLEL default(shared) reduction(max:info)
!$OMP WORKSHARE
a%irp(:) = 0
!$OMP END WORKSHARE
!$OMP DO schedule(STATIC) &
!$OMP private(k)
!$OMP private(k,i)
do k=1,nza
i = itemp(k)
!$OMP ATOMIC UPDATE
@ -3106,6 +3103,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info)
!$OMP END ATOMIC
end do
!$OMP END DO
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
!$OMP END PARALLEL
#else
a%irp(:) = 0
@ -3113,11 +3111,9 @@ subroutine psb_z_mv_csr_from_coo(a,b,info)
i = itemp(k)
a%irp(i) = a%irp(i) + 1
end do
#endif
call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione)
#endif
!write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1))
!write(0,*) name,a%irp(:)
call a%set_host()
end subroutine psb_z_mv_csr_from_coo

@ -44,8 +44,8 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
complex(psb_spk_) :: shift_, tp, ts
complex(psb_spk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -59,19 +59,40 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_c_exscan()
else
!$OMP PARALLEL default(shared)
call inner_c_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
#endif
#if defined(OPENMP)
contains
subroutine inner_c_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
complex(psb_spk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -81,18 +102,19 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -100,24 +122,15 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_c_exscan
#endif
end subroutine psi_c_exscanv
subroutine psb_m_cgelp(trans,iperm,x,info)

@ -44,8 +44,8 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
real(psb_dpk_) :: shift_, tp, ts
real(psb_dpk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -59,19 +59,40 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_d_exscan()
else
!$OMP PARALLEL default(shared)
call inner_d_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
#endif
#if defined(OPENMP)
contains
subroutine inner_d_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
real(psb_dpk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -81,18 +102,19 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -100,24 +122,15 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_d_exscan
#endif
end subroutine psi_d_exscanv
subroutine psb_m_dgelp(trans,iperm,x,info)

@ -44,8 +44,8 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
integer(psb_epk_) :: shift_, tp, ts
integer(psb_epk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -59,19 +59,40 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_e_exscan()
else
!$OMP PARALLEL default(shared)
call inner_e_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
#endif
#if defined(OPENMP)
contains
subroutine inner_e_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
integer(psb_epk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -81,18 +102,19 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -100,24 +122,15 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_e_exscan
#endif
end subroutine psi_e_exscanv
subroutine psb_m_egelp(trans,iperm,x,info)

@ -44,8 +44,8 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
integer(psb_i2pk_) :: shift_, tp, ts
integer(psb_i2pk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -59,19 +59,40 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_i2_exscan()
else
!$OMP PARALLEL default(shared)
call inner_i2_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
#endif
#if defined(OPENMP)
contains
subroutine inner_i2_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
integer(psb_i2pk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -81,18 +102,19 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -100,24 +122,15 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_i2_exscan
#endif
end subroutine psi_i2_exscanv
subroutine psb_m_i2gelp(trans,iperm,x,info)

@ -44,9 +44,8 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
integer(psb_mpk_) :: shift_, tp, ts
integer(psb_mpk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
logical is_nested
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -60,20 +59,40 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_nested = omp_get_nested()
call omp_set_nested(.true.)
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_m_exscan()
else
!$OMP PARALLEL default(shared)
call inner_m_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
#endif
#if defined(OPENMP)
contains
subroutine inner_m_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
integer(psb_mpk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -83,18 +102,19 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -102,24 +122,15 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
call omp_set_nested(is_nested)
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_m_exscan
#endif
end subroutine psi_m_exscanv
subroutine psb_m_mgelp(trans,iperm,x,info)

@ -44,8 +44,8 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
real(psb_spk_) :: shift_, tp, ts
real(psb_spk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -59,19 +59,40 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_s_exscan()
else
!$OMP PARALLEL default(shared)
call inner_s_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
#endif
#if defined(OPENMP)
contains
subroutine inner_s_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
real(psb_spk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -81,18 +102,19 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -100,24 +122,15 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_s_exscan
#endif
end subroutine psi_s_exscanv
subroutine psb_m_sgelp(trans,iperm,x,info)

@ -44,8 +44,8 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase)
integer(psb_ipk_), intent(in), optional :: ibase
complex(psb_dpk_) :: shift_, tp, ts
complex(psb_dpk_), allocatable :: suma(:)
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_
integer(psb_ipk_) :: ibase_
logical is_nested, is_parallel
if (present(shift)) then
shift_ = shift
@ -59,19 +59,40 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase)
end if
#if defined(OPENMP)
is_parallel = omp_in_parallel()
if (is_parallel) then
call inner_z_exscan()
else
!$OMP PARALLEL default(shared)
call inner_z_exscan()
!$OMP END PARALLEL
end if
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
!$OMP PARALLEL default(none) &
!$OMP shared(suma,nthreads,n,x,shift_,ibase_) &
!$OMP private(ithread,wrk,i,first_idx,last_idx)
#endif
#if defined(OPENMP)
contains
subroutine inner_z_exscan()
! Note: all these variables are private, but SUMB should *really* be
! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied
! so effectively we are recovering a SHARED SUMB which is what
! we need in this case. If it was an ALLOCATABLE, then it would be the contents
! that would get copied, and the SHARED effect would no longer be there.
integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk
complex(psb_dpk_), pointer :: sumb(:)
!$OMP SINGLE
nthreads = omp_get_num_threads()
allocate(suma(nthreads+1))
suma(:) = 0
!suma(1) = 1
!$OMP END SINGLE
ithread = omp_get_thread_num()
!$OMP SINGLE
allocate(sumb(nthreads+1))
sumb(:) = 0
!$OMP END SINGLE COPYPRIVATE(sumb)
wrk = (n)/nthreads
if (ithread < MOD((n),nthreads)) then
@ -81,18 +102,19 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase)
first_idx = ithread*wrk + MOD((n),nthreads) + ibase_
end if
last_idx = min(first_idx + wrk - 1,n - (ibase_-ione))
last_idx = min(first_idx + wrk - 1,n - (ione-ibase_))
if (first_idx<=last_idx) then
suma(ithread+2) = suma(ithread+2) + x(first_idx)
sumb(ithread+2) = sumb(ithread+2) + x(first_idx)
do i=first_idx+1,last_idx
suma(ithread+2) = suma(ithread+2) + x(i)
sumb(ithread+2) = sumb(ithread+2) + x(i)
x(i) = x(i)+x(i-1)
end do
end if
!$OMP BARRIER
!$OMP SINGLE
do i=2,nthreads+1
suma(i) = suma(i) + suma(i-1)
sumb(i) = sumb(i) + sumb(i-1)
end do
!$OMP END SINGLE
@ -100,24 +122,15 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase)
!$OMP DO SCHEDULE(STATIC)
do i=1,n
x(i) = suma(ithread+1) + x(i) + shift_
x(i) = sumb(ithread+1) + x(i) + shift_
end do
!$OMP END DO
!$OMP SINGLE
x(1) = shift_
deallocate(sumb)
!$OMP END SINGLE
!$OMP END PARALLEL
#else
tp = shift_
do i=1,n
ts = x(i)
x(i) = tp
tp = tp + ts
end do
end subroutine inner_z_exscan
#endif
end subroutine psi_z_exscanv
subroutine psb_m_zgelp(trans,iperm,x,info)

Loading…
Cancel
Save