From 02dd204351332fb7f9c4bb9b4e550602dde7935e Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 18 Apr 2023 17:00:22 +0200 Subject: [PATCH] Implement psi_exscan and use in _from_coo --- base/modules/auxil/psi_c_serial_mod.f90 | 12 + base/modules/auxil/psi_d_serial_mod.f90 | 12 + base/modules/auxil/psi_e_serial_mod.f90 | 12 + base/modules/auxil/psi_i2_serial_mod.f90 | 12 + base/modules/auxil/psi_m_serial_mod.f90 | 12 + base/modules/auxil/psi_s_serial_mod.f90 | 12 + base/modules/auxil/psi_z_serial_mod.f90 | 12 + ...{psb_c_csc_impl.f90 => psb_c_csc_impl.F90} | 34 +- base/serial/impl/psb_c_csr_impl.F90 | 126 +------ ...{psb_d_csc_impl.f90 => psb_d_csc_impl.F90} | 34 +- base/serial/impl/psb_d_csr_impl.F90 | 126 +------ ...{psb_s_csc_impl.f90 => psb_s_csc_impl.F90} | 34 +- base/serial/impl/psb_s_csr_impl.F90 | 126 +------ ...{psb_z_csc_impl.f90 => psb_z_csc_impl.F90} | 34 +- base/serial/impl/psb_z_csr_impl.F90 | 126 +------ base/serial/psi_c_serial_impl.F90 | 91 +++++ base/serial/psi_d_serial_impl.F90 | 91 +++++ base/serial/psi_e_serial_impl.F90 | 330 ++++++++++++++++- base/serial/psi_i2_serial_impl.F90 | 330 ++++++++++++++++- base/serial/psi_m_serial_impl.F90 | 332 +++++++++++++++++- base/serial/psi_s_serial_impl.F90 | 91 +++++ base/serial/psi_z_serial_impl.F90 | 91 +++++ 22 files changed, 1554 insertions(+), 526 deletions(-) rename base/serial/impl/{psb_c_csc_impl.f90 => psb_c_csc_impl.F90} (99%) rename base/serial/impl/{psb_d_csc_impl.f90 => psb_d_csc_impl.F90} (99%) rename base/serial/impl/{psb_s_csc_impl.f90 => psb_s_csc_impl.F90} (99%) rename base/serial/impl/{psb_z_csc_impl.f90 => psb_z_csc_impl.F90} (99%) diff --git a/base/modules/auxil/psi_c_serial_mod.f90 b/base/modules/auxil/psi_c_serial_mod.f90 index 191e7ef3..d62ba3bc 100644 --- a/base/modules/auxil/psi_c_serial_mod.f90 +++ b/base/modules/auxil/psi_c_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_c_serial_mod end subroutine psi_csctv end interface psi_sct + interface psi_exscan + subroutine psi_c_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_spk_ + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_c_exscanv + end interface psi_exscan + end module psi_c_serial_mod diff --git a/base/modules/auxil/psi_d_serial_mod.f90 b/base/modules/auxil/psi_d_serial_mod.f90 index f1dbc16c..ae88be74 100644 --- a/base/modules/auxil/psi_d_serial_mod.f90 +++ b/base/modules/auxil/psi_d_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_d_serial_mod end subroutine psi_dsctv end interface psi_sct + interface psi_exscan + subroutine psi_d_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_dpk_ + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_d_exscanv + end interface psi_exscan + end module psi_d_serial_mod diff --git a/base/modules/auxil/psi_e_serial_mod.f90 b/base/modules/auxil/psi_e_serial_mod.f90 index 909b025c..a5544075 100644 --- a/base/modules/auxil/psi_e_serial_mod.f90 +++ b/base/modules/auxil/psi_e_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_e_serial_mod end subroutine psi_esctv end interface psi_sct + interface psi_exscan + subroutine psi_e_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_epk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_epk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_e_exscanv + end interface psi_exscan + end module psi_e_serial_mod diff --git a/base/modules/auxil/psi_i2_serial_mod.f90 b/base/modules/auxil/psi_i2_serial_mod.f90 index 38ef9c38..c0b5a327 100644 --- a/base/modules/auxil/psi_i2_serial_mod.f90 +++ b/base/modules/auxil/psi_i2_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_i2_serial_mod end subroutine psi_i2sctv end interface psi_sct + interface psi_exscan + subroutine psi_i2_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_i2pk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_i2pk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_i2_exscanv + end interface psi_exscan + end module psi_i2_serial_mod diff --git a/base/modules/auxil/psi_m_serial_mod.f90 b/base/modules/auxil/psi_m_serial_mod.f90 index a80d0ffe..ab875f7b 100644 --- a/base/modules/auxil/psi_m_serial_mod.f90 +++ b/base/modules/auxil/psi_m_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_m_serial_mod end subroutine psi_msctv end interface psi_sct + interface psi_exscan + subroutine psi_m_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_mpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_m_exscanv + end interface psi_exscan + end module psi_m_serial_mod diff --git a/base/modules/auxil/psi_s_serial_mod.f90 b/base/modules/auxil/psi_s_serial_mod.f90 index 3c0a3bdc..fee1afc6 100644 --- a/base/modules/auxil/psi_s_serial_mod.f90 +++ b/base/modules/auxil/psi_s_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_s_serial_mod end subroutine psi_ssctv end interface psi_sct + interface psi_exscan + subroutine psi_s_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_spk_ + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_s_exscanv + end interface psi_exscan + end module psi_s_serial_mod diff --git a/base/modules/auxil/psi_z_serial_mod.f90 b/base/modules/auxil/psi_z_serial_mod.f90 index 7bc9728e..3ab430cc 100644 --- a/base/modules/auxil/psi_z_serial_mod.f90 +++ b/base/modules/auxil/psi_z_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_z_serial_mod end subroutine psi_zsctv end interface psi_sct + interface psi_exscan + subroutine psi_z_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_dpk_ + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_z_exscanv + end interface psi_exscan + end module psi_z_serial_mod diff --git a/base/serial/impl/psb_c_csc_impl.f90 b/base/serial/impl/psb_c_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_c_csc_impl.f90 rename to base/serial/impl/psb_c_csc_impl.F90 index 87d7e3dd..c573a40d 100644 --- a/base/serial/impl/psb_c_csc_impl.f90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_c_base_mat_mod use psb_c_csc_mat_mod, psb_protect_name => psb_c_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index e091e020..388e9e2b 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) character(len=20) :: name='c_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/impl/psb_d_csc_impl.f90 b/base/serial/impl/psb_d_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_d_csc_impl.f90 rename to base/serial/impl/psb_d_csc_impl.F90 index 4f10439b..891df5a3 100644 --- a/base/serial/impl/psb_d_csc_impl.f90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_d_base_mat_mod use psb_d_csc_mat_mod, psb_protect_name => psb_d_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index e6b9f958..1762604d 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) character(len=20) :: name='d_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/impl/psb_s_csc_impl.f90 b/base/serial/impl/psb_s_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_s_csc_impl.f90 rename to base/serial/impl/psb_s_csc_impl.F90 index e52086d1..2bf77184 100644 --- a/base/serial/impl/psb_s_csc_impl.f90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_s_base_mat_mod use psb_s_csc_mat_mod, psb_protect_name => psb_s_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 73f5484a..71da21a4 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) character(len=20) :: name='s_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/impl/psb_z_csc_impl.f90 b/base/serial/impl/psb_z_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_z_csc_impl.f90 rename to base/serial/impl/psb_z_csc_impl.F90 index 8b0ccc65..22ea3677 100644 --- a/base/serial/impl/psb_z_csc_impl.f90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_z_base_mat_mod use psb_z_csc_mat_mod, psb_protect_name => psb_z_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index b07e37b7..34332f46 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) character(len=20) :: name='z_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index 1da2ce6e..91f463f4 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_c_exscanv(n,x,info,shift,ibase) + use psi_c_serial_mod, psb_protect_name => psi_c_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + complex(psb_spk_) :: shift_, tp, ts + complex(psb_spk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = czero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_c_exscanv + subroutine psb_m_cgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_cgelp use psb_const_mod diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index 8c65b349..d8fb92d7 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_d_exscanv(n,x,info,shift,ibase) + use psi_d_serial_mod, psb_protect_name => psi_d_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + real(psb_dpk_) :: shift_, tp, ts + real(psb_dpk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = dzero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_d_exscanv + subroutine psb_m_dgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_dgelp use psb_const_mod diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index 988bad52..24ce41f1 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_e_exscanv(n,x,info,shift,ibase) + use psi_e_serial_mod, psb_protect_name => psi_e_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_epk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_epk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + integer(psb_epk_) :: shift_, tp, ts + integer(psb_epk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = ezero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_e_exscanv + subroutine psb_m_egelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_egelp use psb_const_mod @@ -441,9 +532,9 @@ subroutine psi_eaxpby(m,n,alpha, x, beta, y, info) integer(psb_epk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +593,8 @@ subroutine psi_eaxpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +624,106 @@ subroutine psi_eaxpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call eaxpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call eaxpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.ezero) then + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = ezero + enddo + else if (beta.eq.eone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +746,7 @@ subroutine psi_eaxpbyv2(m,alpha, x, beta, y, z, info) integer(psb_epk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +785,105 @@ subroutine psi_eaxpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call eaxpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.ezero) then + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = ezero + enddo + else if (beta.eq.eone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1231,7 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.ezero) then if (beta.eq.ezero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = ezero enddo @@ -953,12 +1243,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1261,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1276,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1294,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1309,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1327,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1342,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1435,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1453,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1468,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1486,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1501,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1519,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1534,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index 83b078f0..c5889013 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_i2_exscanv(n,x,info,shift,ibase) + use psi_i2_serial_mod, psb_protect_name => psi_i2_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_i2pk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_i2pk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + integer(psb_i2pk_) :: shift_, tp, ts + integer(psb_i2pk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = i2zero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_i2_exscanv + subroutine psb_m_i2gelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_i2gelp use psb_const_mod @@ -441,9 +532,9 @@ subroutine psi_i2axpby(m,n,alpha, x, beta, y, info) integer(psb_i2pk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +593,8 @@ subroutine psi_i2axpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +624,106 @@ subroutine psi_i2axpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call i2axpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call i2axpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.i2zero) then + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = i2zero + enddo + else if (beta.eq.i2one) then + ! + ! Do nothing! + ! + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +746,7 @@ subroutine psi_i2axpbyv2(m,alpha, x, beta, y, z, info) integer(psb_i2pk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +785,105 @@ subroutine psi_i2axpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call i2axpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.i2zero) then + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = i2zero + enddo + else if (beta.eq.i2one) then + ! + ! Do nothing! + ! + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1231,7 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.i2zero) then if (beta.eq.i2zero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = i2zero enddo @@ -953,12 +1243,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1261,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1276,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1294,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1309,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1327,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1342,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1435,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1453,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1468,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1486,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1501,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1519,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1534,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index 950e2358..d7849cd7 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -29,6 +29,99 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_m_exscanv(n,x,info,shift,ibase) + use psi_m_serial_mod, psb_protect_name => psi_m_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_mpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + integer(psb_mpk_) :: shift_, tp, ts + integer(psb_mpk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + logical is_nested + + if (present(shift)) then + shift_ = shift + else + shift_ = mzero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + is_nested = omp_get_nested() + call omp_set_nested(.true.) + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + !$OMP END PARALLEL + call omp_set_nested(is_nested) +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_m_exscanv + subroutine psb_m_mgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_mgelp use psb_const_mod @@ -441,9 +534,9 @@ subroutine psi_maxpby(m,n,alpha, x, beta, y, info) integer(psb_mpk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +595,8 @@ subroutine psi_maxpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +626,106 @@ subroutine psi_maxpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call maxpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call maxpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.mzero) then + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = mzero + enddo + else if (beta.eq.mone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +748,7 @@ subroutine psi_maxpbyv2(m,alpha, x, beta, y, z, info) integer(psb_mpk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +787,105 @@ subroutine psi_maxpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call maxpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.mzero) then + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = mzero + enddo + else if (beta.eq.mone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1233,7 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.mzero) then if (beta.eq.mzero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = mzero enddo @@ -953,12 +1245,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1263,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1278,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1296,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1311,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1329,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1344,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1437,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1455,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1470,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1488,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1503,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1521,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1536,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index 6c8e21e2..ec22ade7 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_s_exscanv(n,x,info,shift,ibase) + use psi_s_serial_mod, psb_protect_name => psi_s_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + real(psb_spk_) :: shift_, tp, ts + real(psb_spk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = szero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_s_exscanv + subroutine psb_m_sgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_sgelp use psb_const_mod diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index f3087992..9bcdfd7e 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_z_exscanv(n,x,info,shift,ibase) + use psi_z_serial_mod, psb_protect_name => psi_z_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + complex(psb_dpk_) :: shift_, tp, ts + complex(psb_dpk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = zzero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_z_exscanv + subroutine psb_m_zgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_zgelp use psb_const_mod