From e8903bc28be6d0a9165f5e0279ef980f6f9fa791 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Mon, 21 Nov 2022 08:59:59 -0500 Subject: [PATCH 01/38] Fix VL(:) initialization when CHECK_=.false. --- base/tools/psb_cd_inloc.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/base/tools/psb_cd_inloc.f90 b/base/tools/psb_cd_inloc.f90 index f8d609c4..e8b9578e 100644 --- a/base/tools/psb_cd_inloc.f90 +++ b/base/tools/psb_cd_inloc.f90 @@ -275,6 +275,8 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) if (debug_size) & & write(debug_unit,*) me,' ',trim(name),': After sort ',nlu + else + nlu = loc_row end if call psb_nullify_desc(desc) if (do_timings) then From beefe9aa1a6030e1d97827ad74a44ce19fa253b3 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 31 Jan 2023 08:55:00 -0500 Subject: [PATCH 02/38] Missing "compile" for configure. --- compile | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 compile diff --git a/compile b/compile new file mode 100644 index 00000000..e69de29b From afdbac67276a033ebe24b0b52b873ebfed753b5c Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Mon, 6 Feb 2023 19:00:07 +0100 Subject: [PATCH 03/38] Swicth csr_impl to F90 --- ...{psb_c_csr_impl.f90 => psb_c_csr_impl.F90} | 339 +++++++++--------- ...{psb_d_csr_impl.f90 => psb_d_csr_impl.F90} | 339 +++++++++--------- ...{psb_s_csr_impl.f90 => psb_s_csr_impl.F90} | 339 +++++++++--------- ...{psb_z_csr_impl.f90 => psb_z_csr_impl.F90} | 339 +++++++++--------- test/pargen/runs/ppde.inp | 2 +- 5 files changed, 685 insertions(+), 673 deletions(-) rename base/serial/impl/{psb_c_csr_impl.f90 => psb_c_csr_impl.F90} (96%) rename base/serial/impl/{psb_d_csr_impl.f90 => psb_d_csr_impl.F90} (96%) rename base/serial/impl/{psb_s_csr_impl.f90 => psb_s_csr_impl.F90} (96%) rename base/serial/impl/{psb_z_csr_impl.f90 => psb_z_csr_impl.F90} (96%) diff --git a/base/serial/impl/psb_c_csr_impl.f90 b/base/serial/impl/psb_c_csr_impl.F90 similarity index 96% rename from base/serial/impl/psb_c_csr_impl.f90 rename to base/serial/impl/psb_c_csr_impl.F90 index 88510e53..87b0872b 100644 --- a/base/serial/impl/psb_c_csr_impl.f90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2837,6 +2837,9 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_c_base_mat_mod use psb_c_csr_mat_mod, psb_protect_name => psb_c_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csr_sparse_mat), intent(inout) :: a @@ -2851,13 +2854,11 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='c_cp_csr_from_coo' - logical :: use_openmp = .false. - - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j + integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads +#endif info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2902,94 +2903,93 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if +#if defined(OPENMP) + maxthreads = omp_get_max_threads() + allocate(sum(maxthreads+1)) + sum(:) = 0 + sum(1) = 1 + + !$OMP PARALLEL default(none) & + !$OMP shared(nza,itemp,a,nthreads,sum,nr) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + !$OMP BARRIER + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip +#endif call a%set_host() - - + end subroutine psb_c_cp_csr_from_coo @@ -3089,6 +3089,9 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_c_base_mat_mod use psb_c_csr_mat_mod, psb_protect_name => psb_c_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csr_sparse_mat), intent(inout) :: a @@ -3102,12 +3105,12 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + integer(psb_ipk_) :: nxt_val,old_val,saved_elem +#endif info = psb_success_ @@ -3135,74 +3138,74 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(sum,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(sum(nthreads+1)) + sum(:) = 0 + sum(1) = 1 + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3217,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip -!!$ end if +#endif call a%set_host() diff --git a/base/serial/impl/psb_d_csr_impl.f90 b/base/serial/impl/psb_d_csr_impl.F90 similarity index 96% rename from base/serial/impl/psb_d_csr_impl.f90 rename to base/serial/impl/psb_d_csr_impl.F90 index a927b1a9..3aadde20 100644 --- a/base/serial/impl/psb_d_csr_impl.f90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2837,6 +2837,9 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_d_base_mat_mod use psb_d_csr_mat_mod, psb_protect_name => psb_d_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csr_sparse_mat), intent(inout) :: a @@ -2851,13 +2854,11 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='d_cp_csr_from_coo' - logical :: use_openmp = .false. - - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j + integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads +#endif info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2902,94 +2903,93 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if +#if defined(OPENMP) + maxthreads = omp_get_max_threads() + allocate(sum(maxthreads+1)) + sum(:) = 0 + sum(1) = 1 + + !$OMP PARALLEL default(none) & + !$OMP shared(nza,itemp,a,nthreads,sum,nr) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + !$OMP BARRIER + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip +#endif call a%set_host() - - + end subroutine psb_d_cp_csr_from_coo @@ -3089,6 +3089,9 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_d_base_mat_mod use psb_d_csr_mat_mod, psb_protect_name => psb_d_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csr_sparse_mat), intent(inout) :: a @@ -3102,12 +3105,12 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + integer(psb_ipk_) :: nxt_val,old_val,saved_elem +#endif info = psb_success_ @@ -3135,74 +3138,74 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(sum,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(sum(nthreads+1)) + sum(:) = 0 + sum(1) = 1 + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3217,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip -!!$ end if +#endif call a%set_host() diff --git a/base/serial/impl/psb_s_csr_impl.f90 b/base/serial/impl/psb_s_csr_impl.F90 similarity index 96% rename from base/serial/impl/psb_s_csr_impl.f90 rename to base/serial/impl/psb_s_csr_impl.F90 index b2d9dd48..a554e13b 100644 --- a/base/serial/impl/psb_s_csr_impl.f90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2837,6 +2837,9 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_s_base_mat_mod use psb_s_csr_mat_mod, psb_protect_name => psb_s_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csr_sparse_mat), intent(inout) :: a @@ -2851,13 +2854,11 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='s_cp_csr_from_coo' - logical :: use_openmp = .false. - - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j + integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads +#endif info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2902,94 +2903,93 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if +#if defined(OPENMP) + maxthreads = omp_get_max_threads() + allocate(sum(maxthreads+1)) + sum(:) = 0 + sum(1) = 1 + + !$OMP PARALLEL default(none) & + !$OMP shared(nza,itemp,a,nthreads,sum,nr) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + !$OMP BARRIER + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip +#endif call a%set_host() - - + end subroutine psb_s_cp_csr_from_coo @@ -3089,6 +3089,9 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_s_base_mat_mod use psb_s_csr_mat_mod, psb_protect_name => psb_s_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csr_sparse_mat), intent(inout) :: a @@ -3102,12 +3105,12 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + integer(psb_ipk_) :: nxt_val,old_val,saved_elem +#endif info = psb_success_ @@ -3135,74 +3138,74 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(sum,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(sum(nthreads+1)) + sum(:) = 0 + sum(1) = 1 + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3217,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip -!!$ end if +#endif call a%set_host() diff --git a/base/serial/impl/psb_z_csr_impl.f90 b/base/serial/impl/psb_z_csr_impl.F90 similarity index 96% rename from base/serial/impl/psb_z_csr_impl.f90 rename to base/serial/impl/psb_z_csr_impl.F90 index ba2d322f..b2b0d3d1 100644 --- a/base/serial/impl/psb_z_csr_impl.f90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2837,6 +2837,9 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_z_base_mat_mod use psb_z_csr_mat_mod, psb_protect_name => psb_z_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csr_sparse_mat), intent(inout) :: a @@ -2851,13 +2854,11 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='z_cp_csr_from_coo' - logical :: use_openmp = .false. - - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j + integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads +#endif info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2902,94 +2903,93 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if +#if defined(OPENMP) + maxthreads = omp_get_max_threads() + allocate(sum(maxthreads+1)) + sum(:) = 0 + sum(1) = 1 + + !$OMP PARALLEL default(none) & + !$OMP shared(nza,itemp,a,nthreads,sum,nr) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + !$OMP BARRIER + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip +#endif call a%set_host() - - + end subroutine psb_z_cp_csr_from_coo @@ -3089,6 +3089,9 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_z_base_mat_mod use psb_z_csr_mat_mod, psb_protect_name => psb_z_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csr_sparse_mat), intent(inout) :: a @@ -3102,12 +3105,12 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. +#if defined(OPENMP) + integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + integer(psb_ipk_) :: nxt_val,old_val,saved_elem +#endif info = psb_success_ @@ -3135,74 +3138,74 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(sum,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + !$OMP END DO + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(sum(nthreads+1)) + sum(:) = 0 + sum(1) = 1 + !$OMP END SINGLE + + ithread = omp_get_thread_num() + + work = nr/nthreads + if (ithread < MOD(nr,nthreads)) then + work = work + 1 + first_idx = ithread*work + 1 + else + first_idx = ithread*work + MOD(nr,nthreads) + 1 + end if + + last_idx = first_idx + work - 1 + + s = 0 + do i=first_idx,last_idx + s = s + a%irp(i) + end do + if (work > 0) then + sum(ithread+2) = s + end if + + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sum(i) = sum(i) + sum(i-1) + end do + !$OMP END SINGLE + + if (work > 0) then + saved_elem = a%irp(first_idx) + end if + if (ithread == 0) then + a%irp(1) = 1 + end if + + if (work > 0) then + old_val = a%irp(first_idx+1) + a%irp(first_idx+1) = saved_elem + sum(ithread+1) + end if + + do i=first_idx+2,last_idx+1 + nxt_val = a%irp(i) + a%irp(i) = a%irp(i-1) + old_val + old_val = nxt_val + end do + + !$OMP END PARALLEL +#else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3217,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip -!!$ end if +#endif call a%set_host() diff --git a/test/pargen/runs/ppde.inp b/test/pargen/runs/ppde.inp index f4b45430..996585dd 100644 --- a/test/pargen/runs/ppde.inp +++ b/test/pargen/runs/ppde.inp @@ -8,7 +8,7 @@ CSR Storage format for matrix A: CSR COO 0100 MAXIT 05 ITRACE 002 IRST restart for RGMRES and BiCGSTABL -ILU Block Solver ILU,ILUT,INVK,AINVT,AORTH +INVK Block Solver ILU,ILUT,INVK,AINVT,AORTH NONE If ILU : MILU or NONE othewise ignored NONE Scaling if ILUT: NONE, MAXVAL otherwise ignored 0 Level of fill for forward factorization From 784cc65e518374dfaa618be2f6d731030b2f58a4 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 07:45:44 -0500 Subject: [PATCH 04/38] Temporarily revert hash_map_mod waiting for a proper fix --- base/modules/desc/psb_hash_map_mod.f90 | 871 ++++++++++++++++--------- 1 file changed, 547 insertions(+), 324 deletions(-) diff --git a/base/modules/desc/psb_hash_map_mod.f90 b/base/modules/desc/psb_hash_map_mod.f90 index 06e68451..528450ae 100644 --- a/base/modules/desc/psb_hash_map_mod.f90 +++ b/base/modules/desc/psb_hash_map_mod.f90 @@ -659,49 +659,410 @@ contains !$ use_openmp = .true. - info = psb_success_ - name = 'hash_g2l_ins' - call psb_erractionsave(err_act) + if (.true.) then + info = psb_success_ + name = 'hash_g2l_ins' + call psb_erractionsave(err_act) - ctxt = idxmap%get_ctxt() - call psb_info(ctxt, me, np) + ctxt = idxmap%get_ctxt() + call psb_info(ctxt, me, np) - is = size(idx) + is = size(idx) - if (present(mask)) then - if (size(mask) < size(idx)) then - info = -1 - return + if (present(mask)) then + if (size(mask) < size(idx)) then + info = -1 + return + end if + end if + if (present(lidx)) then + if (size(lidx) < size(idx)) then + info = -1 + return + end if end if - end if - if (present(lidx)) then - if (size(lidx) < size(idx)) then + + mglob = idxmap%get_gr() + nrow = idxmap%get_lr() + if (idxmap%is_bld()) then + + if (present(lidx)) then + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + tlip = lip + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == tlip) then + ncol = max(ncol,nxt) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + end if + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + enddo + + else if (.not.present(mask)) then + + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >=0) then + if (nxt == lip) then + ncol = max(nxt,ncol) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + end if + idx(i) = lip + info = psb_success_ + enddo + + end if + + else if (.not.present(lidx)) then + + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + enddo + + else if (.not.present(mask)) then + + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + ch_err='psb_ensure_size' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + ch_err='SearchInsKeyVal' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + goto 9999 + end if + idx(i) = lip + info = psb_success_ + enddo + + + end if + end if + else + ! Wrong state + idx = -1 info = -1 - return end if - end if + else - mglob = idxmap%get_gr() - nrow = idxmap%get_lr() + info = psb_success_ + name = 'hash_g2l_ins' + call psb_erractionsave(err_act) - if (idxmap%is_bld()) then - if (use_openmp) then - !$ call OMP_init_lock(ins_lck) - isLoopValid = .true. - ncol = idxmap%get_lc() + ctxt = idxmap%get_ctxt() + call psb_info(ctxt, me, np) + + is = size(idx) + + if (present(mask)) then + if (size(mask) < size(idx)) then + info = -1 + return + end if end if - if (present(lidx)) then - if (present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,mask,lidx,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is + if (present(lidx)) then + if (size(lidx) < size(idx)) then + info = -1 + return + end if + end if + + mglob = idxmap%get_gr() + nrow = idxmap%get_lr() + + if (idxmap%is_bld()) then + if (use_openmp) then + !$ call OMP_init_lock(ins_lck) + isLoopValid = .true. + ncol = idxmap%get_lc() + end if + + if (present(lidx)) then + if (present(mask)) then + if (use_openmp) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,mask,lidx,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is + + if (mask(i)) then + ip = idx(i) + + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip < 0) then + tlip = lip + nxt = lidx(i) + + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + !$ call OMP_set_lock(ins_lck) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + ! Index not found + if (lip < 0) then + ! Locking system to handle concurrent hashmap read/write. + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + + ncol = MAX(ncol,nxt) + !$ call OMP_unset_lock(ins_lck) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. + cycle + end if + + idxmap%loc_to_glob(nxt) = ip + else + !$ call OMP_unset_lock(ins_lck) + end if + + info = psb_success_ + + else + !$ call OMP_unset_lock(ins_lck) + + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + + isLoopValid = .false. + cycle + end if + else + !$ call OMP_unset_lock(ins_lck) + end if + end if + + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + end do + !$OMP END PARALLEL DO + + call idxmap%set_lc(ncol) - if (mask(i)) then + if (.not. isLoopValid) then + goto 9999 + end if + + else + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + tlip = lip + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == tlip) then + ncol = max(ncol,nxt) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + end if + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + enddo + end if + else if (.not.present(mask)) then + if (use_openmp) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -709,9 +1070,8 @@ contains cycle endif - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. + ! In OMP logic the index research limit is turned off. It is + ! a correct way to fit the subroutine? call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) @@ -729,11 +1089,12 @@ contains ! is often already existing). !$ call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - ! Index not found + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then - ! Locking system to handle concurrent hashmap read/write. + ! Locking system to handle concurrent write/access. Under checking! call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + !$ call OMP_unset_lock(ins_lck) if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -744,12 +1105,13 @@ contains ncol = MAX(ncol,nxt) !$ call OMP_unset_lock(ins_lck) + ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. cycle @@ -761,7 +1123,6 @@ contains end if info = psb_success_ - else !$ call OMP_unset_lock(ins_lck) @@ -778,43 +1139,40 @@ contains idx(i) = lip info = psb_success_ - else - idx(i) = -1 - end if - end do - !$OMP END PARALLEL DO + end do + !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) + call idxmap%set_lc(ncol) - if (.not. isLoopValid) then - goto 9999 - end if + if (.not. isLoopValid) then + goto 9999 + end if - else - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob) ) then + else + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then nxt = lidx(i) if (nxt <= nrow) then idx(i) = -1 cycle endif call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + if (info >=0) then - if (nxt == tlip) then - ncol = max(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) + if (nxt == lip) then + ncol = max(nxt,ncol) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then + info=1 call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) goto 9999 @@ -831,50 +1189,58 @@ contains end if idx(i) = lip info = psb_success_ - else - idx(i) = -1 - end if - enddo + enddo + end if end if - else if (.not.present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif + else if (.not.present(lidx)) then - ! In OMP logic the index research limit is turned off. It is - ! a correct way to fit the subroutine? - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) + if (present(mask)) then + if (use_openmp) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,idx,mask,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is - if (lip < 0) then - tlip = lip - nxt = lidx(i) + ncol = idxmap%get_lc() + info = 0 + if (mask(i)) then + ip = idx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - !$ call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) + nxt = ncol + 1 + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - ! Locking system to handle concurrent write/access. Under checking! - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - !$ call OMP_unset_lock(ins_lck) + if (lip < 0) then + + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + !$ call OMP_set_lock(ins_lck) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + ! Index not found + if (lip < 0) then + ! Locking system to handle concurrent hashmap write/access. + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + else + !$ call OMP_unset_lock(ins_lck) + end if + + idx(i) = lip + info = psb_success_ + end if if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -885,13 +1251,12 @@ contains ncol = MAX(ncol,nxt) !$ call OMP_unset_lock(ins_lck) - ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. cycle @@ -912,78 +1277,71 @@ contains isLoopValid = .false. cycle end if + else - !$ call OMP_unset_lock(ins_lck) + idx(i) = -1 end if - end if - - idx(i) = lip - info = psb_success_ - end do - !$OMP END PARALLEL DO + end do + !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) + call idxmap%set_lc(ncol) - if (.not. isLoopValid) then - goto 9999 - end if + if (.not. isLoopValid) then + goto 9999 + end if - else - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - if (info >=0) then - if (nxt == lip) then - ncol = max(nxt,ncol) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) + else + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + idx(i) = lip info = psb_success_ else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 + idx(i) = -1 end if - end if - idx(i) = lip - info = psb_success_ - enddo - end if - end if - - else if (.not.present(lidx)) then - - if (present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mask,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is + enddo + end if + else if (.not.present(mask)) then + if (use_openmp) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is - if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -1004,7 +1362,7 @@ contains ! is often already existing). !$ call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) + & idxmap%hashv,idxmap%glb_lc,ncol) ! Index not found if (lip < 0) then @@ -1024,7 +1382,6 @@ contains ! the hash map. In that case 'tlip' is the value corresponding ! to the existing mapping. if (nxt == tlip) then - ncol = MAX(ncol,nxt) !$ call OMP_unset_lock(ins_lck) @@ -1033,9 +1390,9 @@ contains if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. + !$ isLoopValid = .false. cycle end if @@ -1045,6 +1402,7 @@ contains end if info = psb_success_ + else !$ call OMP_unset_lock(ins_lck) @@ -1054,23 +1412,18 @@ contains isLoopValid = .false. cycle end if + end do + !$OMP END PARALLEL DO - else - idx(i) = -1 - end if - end do - !$OMP END PARALLEL DO + call idxmap%set_lc(ncol) - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 - end if + if (.not. isLoopValid) then + goto 9999 + end if - else - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then + else + do i = 1, is + ncol = idxmap%get_lc() ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 @@ -1091,8 +1444,9 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 + ch_err='psb_ensure_size' call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='psb_ensure_size',i_err=(/info/)) + &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) goto 9999 end if idxmap%loc_to_glob(nxt) = ip @@ -1100,159 +1454,28 @@ contains endif info = psb_success_ else + ch_err='SearchInsKeyVal' call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) + & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) goto 9999 end if idx(i) = lip info = psb_success_ - else - idx(i) = -1 - end if - enddo - end if - else if (.not.present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is - - ip = idx(i) - - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip < 0) then - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - !$ call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap write/access. - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - else - !$ call OMP_unset_lock(ins_lck) - end if - - idx(i) = lip - info = psb_success_ - end if - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - !$ isLoopValid = .false. - cycle - end if - - idxmap%loc_to_glob(nxt) = ip - else - !$ call OMP_unset_lock(ins_lck) - end if - - info = psb_success_ - - else - !$ call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle - end if - end do - !$OMP END PARALLEL DO - - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 + enddo end if - - else - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if - - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - ch_err='psb_ensure_size' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - ch_err='SearchInsKeyVal' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 - end if - idx(i) = lip - info = psb_success_ - enddo end if end if - end if - if (use_openmp) then - !$ call OMP_destroy_lock(ins_lck) - end if + if (use_openmp) then + !$ call OMP_destroy_lock(ins_lck) + end if - else - ! Wrong state - idx = -1 - info = -1 + else + ! Wrong state + idx = -1 + info = -1 + end if end if - call psb_erractionrestore(err_act) return From 49d37911caa43599512a6a57974da83be5dd0ec9 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 17:14:47 +0100 Subject: [PATCH 05/38] Work on psb_hash_map_mod --- ..._hash_map_mod.f90 => psb_hash_map_mod.F90} | 512 +++++++++--------- 1 file changed, 257 insertions(+), 255 deletions(-) rename base/modules/desc/{psb_hash_map_mod.f90 => psb_hash_map_mod.F90} (97%) diff --git a/base/modules/desc/psb_hash_map_mod.f90 b/base/modules/desc/psb_hash_map_mod.F90 similarity index 97% rename from base/modules/desc/psb_hash_map_mod.f90 rename to base/modules/desc/psb_hash_map_mod.F90 index 528450ae..efb3ed10 100644 --- a/base/modules/desc/psb_hash_map_mod.f90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -636,7 +636,9 @@ contains use psb_realloc_mod use psb_sort_mod use psb_penv_mod - !$ use omp_lib +#ifdef OPENMP + use omp_lib +#endif implicit none @@ -655,230 +657,12 @@ contains logical :: use_openmp = .false. logical, volatile :: isLoopValid - !$ integer(kind = OMP_lock_kind) :: ins_lck - - !$ use_openmp = .true. - - if (.true.) then - info = psb_success_ - name = 'hash_g2l_ins' - call psb_erractionsave(err_act) - - ctxt = idxmap%get_ctxt() - call psb_info(ctxt, me, np) - - is = size(idx) - - if (present(mask)) then - if (size(mask) < size(idx)) then - info = -1 - return - end if - end if - if (present(lidx)) then - if (size(lidx) < size(idx)) then - info = -1 - return - end if - end if - - - mglob = idxmap%get_gr() - nrow = idxmap%get_lr() - if (idxmap%is_bld()) then - - if (present(lidx)) then - if (present(mask)) then - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob) ) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (info >=0) then - if (nxt == tlip) then - ncol = max(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 - end if - enddo - - else if (.not.present(mask)) then - - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - if (info >=0) then - if (nxt == lip) then - ncol = max(nxt,ncol) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - end if - idx(i) = lip - info = psb_success_ - enddo - - end if - - else if (.not.present(lidx)) then - - if (present(mask)) then - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if - - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 - end if - enddo - - else if (.not.present(mask)) then - - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if - - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - ch_err='psb_ensure_size' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - ch_err='SearchInsKeyVal' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 - end if - idx(i) = lip - info = psb_success_ - enddo - - - end if - end if - else - ! Wrong state - idx = -1 - info = -1 - end if - else +#ifdef OPENMP + integer(kind = OMP_lock_kind) :: ins_lck +#endif + if (use_openmp) then +#ifdef OPENMP info = psb_success_ name = 'hash_g2l_ins' call psb_erractionsave(err_act) @@ -905,13 +689,12 @@ contains mglob = idxmap%get_gr() nrow = idxmap%get_lr() - if (idxmap%is_bld()) then - if (use_openmp) then - !$ call OMP_init_lock(ins_lck) - isLoopValid = .true. - ncol = idxmap%get_lc() - end if - + if (idxmap%is_bld()) then + + call OMP_init_lock(ins_lck) + isLoopValid = .true. + ncol = idxmap%get_lc() + if (present(lidx)) then if (present(mask)) then if (use_openmp) then @@ -947,7 +730,7 @@ contains ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - !$ call OMP_set_lock(ins_lck) + call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) ! Index not found @@ -962,7 +745,7 @@ contains if (nxt == tlip) then ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -977,13 +760,13 @@ contains idxmap%loc_to_glob(nxt) = ip else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if info = psb_success_ else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) @@ -992,7 +775,7 @@ contains cycle end if else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if end if @@ -1056,7 +839,9 @@ contains end if enddo end if + else if (.not.present(mask)) then + if (use_openmp) then !$OMP PARALLEL DO default(none) schedule(STATIC) & !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & @@ -1087,14 +872,14 @@ contains ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - !$ call OMP_set_lock(ins_lck) + call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then ! Locking system to handle concurrent write/access. Under checking! call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -1103,7 +888,7 @@ contains if (nxt == tlip) then ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& @@ -1119,12 +904,12 @@ contains idxmap%loc_to_glob(nxt) = ip else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if info = psb_success_ else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) @@ -1133,7 +918,7 @@ contains cycle end if else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if end if @@ -1225,7 +1010,7 @@ contains ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - !$ call OMP_set_lock(ins_lck) + call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) @@ -1235,7 +1020,7 @@ contains call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if idx(i) = lip @@ -1249,7 +1034,7 @@ contains if (nxt == tlip) then ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -1264,12 +1049,12 @@ contains idxmap%loc_to_glob(nxt) = ip else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if info = psb_success_ else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) @@ -1360,7 +1145,7 @@ contains ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - !$ call OMP_set_lock(ins_lck) + call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) @@ -1370,7 +1155,7 @@ contains call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if idx(i) = lip @@ -1383,7 +1168,7 @@ contains ! to the existing mapping. if (nxt == tlip) then ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -1392,19 +1177,19 @@ contains call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) - !$ isLoopValid = .false. + isLoopValid = .false. cycle end if idxmap%loc_to_glob(nxt) = ip else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if info = psb_success_ else - !$ call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) @@ -1466,10 +1251,227 @@ contains end if end if - if (use_openmp) then - !$ call OMP_destroy_lock(ins_lck) + call OMP_destroy_lock(ins_lck) + + else + ! Wrong state + idx = -1 + info = -1 + end if +#endif + else if (.not.use_openmp) then + info = psb_success_ + name = 'hash_g2l_ins' + call psb_erractionsave(err_act) + + ctxt = idxmap%get_ctxt() + call psb_info(ctxt, me, np) + + is = size(idx) + + if (present(mask)) then + if (size(mask) < size(idx)) then + info = -1 + return end if + end if + if (present(lidx)) then + if (size(lidx) < size(idx)) then + info = -1 + return + end if + end if + + + mglob = idxmap%get_gr() + nrow = idxmap%get_lr() + if (idxmap%is_bld()) then + + if (present(lidx)) then + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + tlip = lip + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == tlip) then + ncol = max(ncol,nxt) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + end if + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + enddo + + else if (.not.present(mask)) then + + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >=0) then + if (nxt == lip) then + ncol = max(nxt,ncol) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + end if + idx(i) = lip + info = psb_success_ + enddo + + end if + + else if (.not.present(lidx)) then + + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_ensure_size',i_err=(/info/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + goto 9999 + end if + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + enddo + + else if (.not.present(mask)) then + + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + ch_err='psb_ensure_size' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + goto 9999 + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + ch_err='SearchInsKeyVal' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + goto 9999 + end if + idx(i) = lip + info = psb_success_ + enddo + + + end if + end if else ! Wrong state idx = -1 From bb4e80f647b1ad51f391313c193e6a5952faacfd Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 10 Feb 2023 17:57:56 +0100 Subject: [PATCH 06/38] Bit of cleanup in psb_hash_map_mod --- base/modules/desc/psb_hash_map_mod.F90 | 523 ++++++++----------------- 1 file changed, 172 insertions(+), 351 deletions(-) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index efb3ed10..e5ab6385 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -690,164 +690,20 @@ contains nrow = idxmap%get_lr() if (idxmap%is_bld()) then - + call OMP_init_lock(ins_lck) isLoopValid = .true. ncol = idxmap%get_lc() - + if (present(lidx)) then if (present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,mask,lidx,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is - - if (mask(i)) then - ip = idx(i) - - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip < 0) then - tlip = lip - nxt = lidx(i) - - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap read/write. - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - call OMP_unset_lock(ins_lck) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - cycle - end if - - idxmap%loc_to_glob(nxt) = ip - else - call OMP_unset_lock(ins_lck) - end if - - info = psb_success_ - - else - call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle - end if - else - call OMP_unset_lock(ins_lck) - end if - end if - - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 - end if - end do - !$OMP END PARALLEL DO - - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 - end if - - else - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob) ) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (info >=0) then - if (nxt == tlip) then - ncol = max(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 - end if - enddo - end if - - else if (.not.present(mask)) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,mask,lidx,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -855,14 +711,15 @@ contains cycle endif - ! In OMP logic the index research limit is turned off. It is - ! a correct way to fit the subroutine? + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then tlip = lip - nxt = lidx(i) + nxt = lidx(i) if (nxt <= nrow) then idx(i) = -1 @@ -872,14 +729,13 @@ contains ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - call OMP_set_lock(ins_lck) + call OMP_set_lock(ins_lck) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - + ! Index not found if (lip < 0) then - ! Locking system to handle concurrent write/access. Under checking! + ! Locking system to handle concurrent hashmap read/write. call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - call OMP_unset_lock(ins_lck) if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -890,7 +746,6 @@ contains ncol = MAX(ncol,nxt) call OMP_unset_lock(ins_lck) - ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -904,10 +759,11 @@ contains idxmap%loc_to_glob(nxt) = ip else - call OMP_unset_lock(ins_lck) + call OMP_unset_lock(ins_lck) end if info = psb_success_ + else call OMP_unset_lock(ins_lck) @@ -924,108 +780,58 @@ contains idx(i) = lip info = psb_success_ - end do - !$OMP END PARALLEL DO - - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if + end do + !$OMP END PARALLEL DO - else - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip + call idxmap%set_lc(ncol) - if (info >=0) then - if (nxt == lip) then - ncol = max(nxt,ncol) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - end if - idx(i) = lip - info = psb_success_ - enddo + if (.not. isLoopValid) then + goto 9999 end if - end if - else if (.not.present(lidx)) then - if (present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mask,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is - - ncol = idxmap%get_lc() - info = 0 - if (mask(i)) then - ip = idx(i) - - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif + else if (.not.present(mask)) then - nxt = ncol + 1 - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is + ip = idx(i) - if (lip < 0) then + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap write/access. - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - else - call OMP_unset_lock(ins_lck) - end if + ! In OMP logic the index research limit is turned off. It is + ! a correct way to fit the subroutine? + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) - idx(i) = lip - info = psb_success_ - end if + if (lip < 0) then + tlip = lip + nxt = lidx(i) + + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + call OMP_set_lock(ins_lck) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip < 0) then + ! Locking system to handle concurrent write/access. Under checking! + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + call OMP_unset_lock(ins_lck) if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -1036,6 +842,7 @@ contains ncol = MAX(ncol,nxt) call OMP_unset_lock(ins_lck) + ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -1062,71 +869,36 @@ contains isLoopValid = .false. cycle end if - else - idx(i) = -1 + call OMP_unset_lock(ins_lck) end if - end do - !$OMP END PARALLEL DO - - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 end if - else - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if + idx(i) = lip + info = psb_success_ + end do + !$OMP END PARALLEL DO - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 - end if - enddo + call idxmap%set_lc(ncol) + + if (.not. isLoopValid) then + goto 9999 end if - else if (.not.present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is + end if + + else if (.not.present(lidx)) then + + if (present(mask)) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,idx,mask,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is + + ncol = idxmap%get_lc() + info = 0 + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -1134,6 +906,7 @@ contains cycle endif + nxt = ncol + 1 ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. @@ -1167,6 +940,7 @@ contains ! the hash map. In that case 'tlip' is the value corresponding ! to the existing mapping. if (nxt == tlip) then + ncol = MAX(ncol,nxt) call OMP_unset_lock(ins_lck) @@ -1187,7 +961,6 @@ contains end if info = psb_success_ - else call OMP_unset_lock(ins_lck) @@ -1197,57 +970,105 @@ contains isLoopValid = .false. cycle end if - end do - !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if + end do + !$OMP END PARALLEL DO - else - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) + call idxmap%set_lc(ncol) + + if (.not. isLoopValid) then + goto 9999 + end if + + else if (.not.present(mask)) then + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(name,is,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) + do i = 1, is + + ip = idx(i) + + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip < 0) then + + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + call OMP_set_lock(ins_lck) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + ! Index not found if (lip < 0) then + ! Locking system to handle concurrent hashmap write/access. call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip + else + call OMP_unset_lock(ins_lck) end if - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - ch_err='psb_ensure_size' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ + idx(i) = lip + info = psb_success_ + end if + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + ncol = MAX(ncol,nxt) + call OMP_unset_lock(ins_lck) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. + cycle + end if + + idxmap%loc_to_glob(nxt) = ip else - ch_err='SearchInsKeyVal' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 + call OMP_unset_lock(ins_lck) end if - idx(i) = lip + info = psb_success_ - enddo + + else + call OMP_unset_lock(ins_lck) + + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + + isLoopValid = .false. + cycle + end if + end do + !$OMP END PARALLEL DO + + call idxmap%set_lc(ncol) + + if (.not. isLoopValid) then + goto 9999 end if + end if end if From ed7862a848f5b0c220baa011845c82e5d48f9ced Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 14 Feb 2023 13:04:58 +0100 Subject: [PATCH 07/38] Fix OpenMP g2lv1_ins --- base/modules/desc/psb_hash_map_mod.F90 | 405 ++++++++++--------------- 1 file changed, 168 insertions(+), 237 deletions(-) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index e5ab6385..5f32b2e1 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -654,88 +654,97 @@ contains type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: me, np character(len=20) :: name,ch_err - logical :: use_openmp = .false. - - logical, volatile :: isLoopValid + logical, allocatable :: mask_(:) + logical :: use_openmp = .true. #ifdef OPENMP integer(kind = OMP_lock_kind) :: ins_lck #endif + logical, volatile :: isLoopValid + info = psb_success_ + name = 'hash_g2l_ins' + call psb_erractionsave(err_act) - if (use_openmp) then -#ifdef OPENMP - info = psb_success_ - name = 'hash_g2l_ins' - call psb_erractionsave(err_act) - - ctxt = idxmap%get_ctxt() - call psb_info(ctxt, me, np) + ctxt = idxmap%get_ctxt() + call psb_info(ctxt, me, np) - is = size(idx) + is = size(idx) - if (present(mask)) then - if (size(mask) < size(idx)) then - info = -1 - return - end if + if (present(mask)) then + if (size(mask) < size(idx)) then + info = -1 + return end if + end if - if (present(lidx)) then - if (size(lidx) < size(idx)) then - info = -1 - return - end if + if (present(lidx)) then + if (size(lidx) < size(idx)) then + info = -1 + return end if + end if + + mglob = idxmap%get_gr() + nrow = idxmap%get_lr() - mglob = idxmap%get_gr() - nrow = idxmap%get_lr() + if (use_openmp) then +#ifdef OPENMP + call OMP_init_lock(ins_lck) if (idxmap%is_bld()) then - call OMP_init_lock(ins_lck) isLoopValid = .true. ncol = idxmap%get_lc() + if (present(mask)) then + !write(0,*) 'present mask' + mask_ = mask + else + allocate(mask_(size(idx))) + mask_ = .true. + end if if (present(lidx)) then + !write(0,*) 'present lidx' if (present(mask)) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,mask,lidx,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + + !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & + !$OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + !$OMP reduction(.AND.:isLoopValid) do i = 1, is - + info = 0 if (mask(i)) then ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif + call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif + call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + nxt = lidx(i) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap read/write. + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -744,7 +753,6 @@ contains if (nxt == tlip) then ncol = MAX(ncol,nxt) - call OMP_unset_lock(ins_lck) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -754,84 +762,68 @@ contains &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - cycle + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - - idxmap%loc_to_glob(nxt) = ip - else - call OMP_unset_lock(ins_lck) end if - - info = psb_success_ - else - call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle + idx(i) = -1 end if - else call OMP_unset_lock(ins_lck) end if + else + idx(i) = lip end if - - idx(i) = lip - info = psb_success_ else idx(i) = -1 end if + end do !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) - if (.not. isLoopValid) then goto 9999 end if + else - - else if (.not.present(mask)) then - - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & + !$OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & + !$OMP private(i,ip,lip,tlip,nxt,info) & + !$OMP reduction(.AND.:isLoopValid) do i = 1, is + info = 0 ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif + call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + call OMP_unset_lock(ins_lck) - ! In OMP logic the index research limit is turned off. It is - ! a correct way to fit the subroutine? + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - + call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + nxt = lidx(i) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - ! Locking system to handle concurrent write/access. Under checking! + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - call OMP_unset_lock(ins_lck) + lip = tlip if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -840,9 +832,7 @@ contains if (nxt == tlip) then ncol = MAX(ncol,nxt) - call OMP_unset_lock(ins_lck) - ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) @@ -851,62 +841,48 @@ contains &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - cycle + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - - idxmap%loc_to_glob(nxt) = ip - else - call OMP_unset_lock(ins_lck) end if - - info = psb_success_ else - call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle + idx(i) = -1 end if - else call OMP_unset_lock(ins_lck) end if + else + idx(i) = lip end if - idx(i) = lip - info = psb_success_ end do !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) - if (.not. isLoopValid) then goto 9999 end if - end if - else if (.not.present(lidx)) then - - if (present(mask)) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mask,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + !write(0,*) 'not present lidx' + if(present(mask)) then + !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & + !$OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & !$OMP private(i,ip,lip,tlip,nxt,info) & !$OMP reduction(.AND.:isLoopValid) do i = 1, is - - ncol = idxmap%get_lc() info = 0 if (mask(i)) then ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif + call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + call OMP_unset_lock(ins_lck) - nxt = ncol + 1 ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. @@ -914,198 +890,153 @@ contains & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then - + call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index - ! is often already existing). - call OMP_set_lock(ins_lck) + ! is often already existing, so this lock is relatively rare). + ncol = idxmap%get_lc() + nxt = ncol + 1 call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap write/access. + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip - else - call OMP_unset_lock(ins_lck) - end if - - idx(i) = lip - info = psb_success_ - end if - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then - ncol = MAX(ncol,nxt) - call OMP_unset_lock(ins_lck) + ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. - cycle + isLoopValid = .false. + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + end if + end if + else + idx(i) = -1 end if - - idxmap%loc_to_glob(nxt) = ip - else call OMP_unset_lock(ins_lck) end if - - info = psb_success_ else - call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle + idx(i) = lip end if - else idx(i) = -1 end if + end do !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) - if (.not. isLoopValid) then goto 9999 end if - - else if (.not.present(mask)) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & + else + !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & + !$OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + !$OMP reduction(.AND.:isLoopValid) do i = 1, is - + info = 0 ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif + call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - + call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). - call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + nxt = ncol + 1 call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap write/access. + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip - else - call OMP_unset_lock(ins_lck) - end if - idx(i) = lip - info = psb_success_ - end if + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - ncol = MAX(ncol,nxt) - call OMP_unset_lock(ins_lck) + ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. - cycle + isLoopValid = .false. + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + end if + end if + else + idx(i) = -1 end if - - idxmap%loc_to_glob(nxt) = ip - else call OMP_unset_lock(ins_lck) end if - info = psb_success_ - else - call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle + idx(i) = lip end if + end do !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) - if (.not. isLoopValid) then goto 9999 end if end if end if - - call OMP_destroy_lock(ins_lck) - else ! Wrong state idx = -1 info = -1 end if + call OMP_destroy_lock(ins_lck) + #endif else if (.not.use_openmp) then - info = psb_success_ - name = 'hash_g2l_ins' - call psb_erractionsave(err_act) - - ctxt = idxmap%get_ctxt() - call psb_info(ctxt, me, np) - is = size(idx) - - if (present(mask)) then - if (size(mask) < size(idx)) then - info = -1 - return - end if - end if - if (present(lidx)) then - if (size(lidx) < size(idx)) then - info = -1 - return - end if - end if - - - mglob = idxmap%get_gr() - nrow = idxmap%get_lr() if (idxmap%is_bld()) then if (present(lidx)) then @@ -1205,14 +1136,14 @@ contains if (present(mask)) then do i = 1, is - ncol = idxmap%get_lc() if (mask(i)) then - ip = idx(i) + ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - nxt = ncol + 1 + ncol = idxmap%get_lc() + nxt = ncol + 1 call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& & idxmap%glb_lc,ncol) if (lip < 0) then From c05b32c20219fa613fe94064d2cb9cba5057abd4 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 29 Mar 2023 14:41:35 +0200 Subject: [PATCH 08/38] Reset status for csr_impl. --- base/internals/psi_bld_tmphalo.f90 | 1 + base/modules/desc/psb_hash_map_mod.F90 | 2 +- base/modules/desc/psb_hash_mod.F90 | 26 +++++++++------ base/serial/impl/psb_c_csr_impl.F90 | 44 ++++++++++++++++++-------- base/serial/impl/psb_d_csr_impl.F90 | 44 ++++++++++++++++++-------- base/serial/impl/psb_s_csr_impl.F90 | 44 ++++++++++++++++++-------- base/serial/impl/psb_z_csr_impl.F90 | 44 ++++++++++++++++++-------- 7 files changed, 139 insertions(+), 66 deletions(-) diff --git a/base/internals/psi_bld_tmphalo.f90 b/base/internals/psi_bld_tmphalo.f90 index 78ce69ed..9d3fb7ab 100644 --- a/base/internals/psi_bld_tmphalo.f90 +++ b/base/internals/psi_bld_tmphalo.f90 @@ -104,6 +104,7 @@ subroutine psi_bld_tmphalo(desc,info) call desc%indxmap%l2gip(helem(1:nh),info) if (info == psb_success_) call desc%indxmap%fnd_owner(helem(1:nh),hproc,info) + !write(0,*) 'bld_tmphalo calling set_owner',hproc(:) if (info == psb_success_) call desc%indxmap%set_halo_owner(hproc,info) if (info == psb_success_) call desc%indxmap%xtnd_p_adjcncy(hproc) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index 5f32b2e1..82e1f012 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -655,7 +655,7 @@ contains integer(psb_ipk_) :: me, np character(len=20) :: name,ch_err logical, allocatable :: mask_(:) - logical :: use_openmp = .true. + logical :: use_openmp = .false. #ifdef OPENMP integer(kind = OMP_lock_kind) :: ins_lck #endif diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index 42911e3f..0c682670 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -388,7 +388,7 @@ contains info = HashOK hsize = hash%hsize hmask = hash%hmask - + val = -1 hk = iand(psb_hashval(key),hmask) if (hk == 0) then hd = 1 @@ -409,6 +409,7 @@ contains info = HashDuplicate return end if + !$OMP CRITICAL if (hash%table(hk,1) == HashFreeEntry) then if (hash%nk == hash%hsize -1) then ! @@ -420,19 +421,22 @@ contains call psb_hash_realloc(hash,info) if (info /= HashOk) then info = HashOutOfMemory - return + !return else call psb_hash_searchinskey(key,val,nextval,hash,info) - return + !return end if else hash%nk = hash%nk + 1 hash%table(hk,1) = key hash%table(hk,2) = nextval val = nextval - return + !return end if end if + !$OMP END CRITICAL + if (info /= HashOk) return + if (val > 0) return hk = hk - hd if (hk < 0) hk = hk + hsize end do @@ -448,7 +452,7 @@ contains info = HashOK hsize = hash%hsize hmask = hash%hmask - + hk = iand(psb_hashval(key),hmask) if (hk == 0) then hd = 1 @@ -460,7 +464,7 @@ contains info = HashOutOfMemory return end if - + val = -1 hash%nsrch = hash%nsrch + 1 do hash%nacc = hash%nacc + 1 @@ -469,6 +473,7 @@ contains info = HashDuplicate return end if + !$OMP CRITICAL if (hash%table(hk,1) == HashFreeEntry) then if (hash%nk == hash%hsize -1) then ! @@ -480,19 +485,22 @@ contains call psb_hash_realloc(hash,info) if (info /= HashOk) then info = HashOutOfMemory - return + !return else call psb_hash_searchinskey(key,val,nextval,hash,info) - return + !return end if else hash%nk = hash%nk + 1 hash%table(hk,1) = key hash%table(hk,2) = nextval val = nextval - return + !return end if end if + !$OMP END CRITICAL + if (info /= HashOk) return + if (val > 0) return hk = hk - hd if (hk < 0) hk = hk + hsize end do diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index fa5f2edf..a7869136 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2862,6 +2874,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='c_cp_csr_from_coo' + logical :: use_openmp = .false. + #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j @@ -2983,21 +2997,22 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - + + end subroutine psb_c_cp_csr_from_coo @@ -3113,6 +3128,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' + logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3120,7 +3136,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3214,6 +3229,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else + do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index faf422ae..a2ddad30 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2862,6 +2874,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='d_cp_csr_from_coo' + logical :: use_openmp = .false. + #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j @@ -2983,21 +2997,22 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - + + end subroutine psb_d_cp_csr_from_coo @@ -3113,6 +3128,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' + logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3120,7 +3136,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3214,6 +3229,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else + do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 446e6791..a8c67c10 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2862,6 +2874,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='s_cp_csr_from_coo' + logical :: use_openmp = .false. + #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j @@ -2983,21 +2997,22 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - + + end subroutine psb_s_cp_csr_from_coo @@ -3113,6 +3128,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' + logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3120,7 +3136,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3214,6 +3229,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else + do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 5ec579d5..6344b268 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2862,6 +2874,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='z_cp_csr_from_coo' + logical :: use_openmp = .false. + #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j @@ -2983,21 +2997,22 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - + + end subroutine psb_z_cp_csr_from_coo @@ -3113,6 +3128,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' + logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3120,7 +3136,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3214,6 +3229,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else + do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 From 0f1603a2e959c18d9365b228de02db9eb063bc13 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 13 Apr 2023 16:40:29 +0200 Subject: [PATCH 09/38] The current version of test/omp seems to be working. To be completed --- base/modules/desc/psb_hash_map_mod.F90 | 30 +- base/serial/impl/psb_d_coo_impl.F90 | 38 +- base/tools/psb_dspins.F90 | 35 +- test/omp/Makefile | 56 ++ test/omp/psb_tomp.F90 | 1062 ++++++++++++++++++++++++ test/pargen/psb_d_pde3d.F90 | 1 + 6 files changed, 1193 insertions(+), 29 deletions(-) create mode 100644 test/omp/Makefile create mode 100644 test/omp/psb_tomp.F90 diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index 82e1f012..af9cdf61 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -1036,7 +1036,12 @@ contains #endif else if (.not.use_openmp) then - +#ifdef OPENMP + ! $ omp parallel + ! $ omp critical + !write(0,*) 'In cnv: ',omp_get_num_threads() +#endif + isLoopValid = .true. if (idxmap%is_bld()) then if (present(lidx)) then @@ -1067,7 +1072,7 @@ contains if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1076,7 +1081,7 @@ contains else call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if end if idx(i) = lip @@ -1114,7 +1119,7 @@ contains info=1 call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1123,7 +1128,7 @@ contains else call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if end if idx(i) = lip @@ -1160,7 +1165,7 @@ contains info=1 call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1169,7 +1174,7 @@ contains else call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if idx(i) = lip info = psb_success_ @@ -1205,7 +1210,8 @@ contains ch_err='psb_ensure_size' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 + isLoopValid = .false. + end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1215,7 +1221,7 @@ contains ch_err='SearchInsKeyVal' call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 + isLoopValid = .false. end if idx(i) = lip info = psb_success_ @@ -1229,6 +1235,12 @@ contains idx = -1 info = -1 end if +#ifdef OPENMP + ! $ omp end critical + ! $ omp end parallel + +#endif + if (.not. isLoopValid) goto 9999 end if call psb_erractionrestore(err_act) return diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 24b486e2..f4f4f0dc 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -2818,6 +2818,9 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_coo_sparse_mat), intent(inout) :: a @@ -2829,7 +2832,7 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='d_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit, nzaold info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2862,9 +2865,11 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2872,16 +2877,23 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) +#if defined(OPENMP) + !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() +#endif end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2933,9 +2945,9 @@ contains ! the serial version: each element is stored in data ! structures but the invalid ones are stored as '-1' values. ! These values will be filtered in a future fixing process. - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nz,imin,imax,jmin,jmax,ia,ja,val,ia1,ia2,aspk,nza) & - !$OMP private(ir,ic,i) + ! $ OMP PARALLEL DO default(none) schedule(STATIC) & + ! $ OMP shared(nz,imin,imax,jmin,jmax,ia,ja,val,ia1,ia2,aspk,nza) & + ! $ OMP private(ir,ic,i) do i=1,nz ir = ia(i) ic = ja(i) @@ -2949,9 +2961,9 @@ contains aspk(nza+i) = -1 end if end do - !$OMP END PARALLEL DO + ! $OMP END PARALLEL DO - nza = nza + nz + !nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 2a70ab83..9c4f137f 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -51,6 +51,9 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_dspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -82,7 +85,9 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) - +#if defined(OPENMP) + !write(0,*) name,omp_get_num_threads(),omp_get_thread_num() +#endif if (nz < 0) then info = 1111 call psb_errpush(info,name) @@ -131,15 +136,26 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if - - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol) +#endif + + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp critical +#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - +!!$ write(0,*) omp_get_thread_num(),'Check g2l: ',ila(1:nz),':',& +!!$ & jla(1:nz),':',ja(1:nz) +#if defined(OPENMP) + !$omp end critical +#endif + if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - goto 9999 + !goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() @@ -149,7 +165,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then @@ -175,8 +191,13 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - goto 9999 + !goto 9999 end if + +#if defined(OPENMP) + !$omp end parallel +#endif + endif else if (desc_a%is_asb()) then diff --git a/test/omp/Makefile b/test/omp/Makefile new file mode 100644 index 00000000..c35431c5 --- /dev/null +++ b/test/omp/Makefile @@ -0,0 +1,56 @@ +INSTALLDIR=../.. +INCDIR=$(INSTALLDIR)/include +MODDIR=$(INSTALLDIR)/modules/ +include $(INCDIR)/Make.inc.psblas +# +# Libraries used +LIBDIR=$(INSTALLDIR)/lib +PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base +LDLIBS=$(PSBLDLIBS) +# +# Compilers and such +# +CCOPT= -g +FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG). + + +EXEDIR=./runs + +all: runsd psb_tomp #psb_d_pde3d psb_s_pde3d psb_d_pde2d psb_s_pde2d + +runsd: + (if test ! -d runs ; then mkdir runs; fi) + +psb_tomp: psb_tomp.o + $(FLINK) psb_tomp.o -o psb_tomp $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_tomp $(EXEDIR) + +psb_d_pde3d: psb_d_pde3d.o + $(FLINK) psb_d_pde3d.o -o psb_d_pde3d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_d_pde3d $(EXEDIR) + + +psb_s_pde3d: psb_s_pde3d.o + $(FLINK) psb_s_pde3d.o -o psb_s_pde3d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_s_pde3d $(EXEDIR) + +psb_d_pde2d: psb_d_pde2d.o + $(FLINK) psb_d_pde2d.o -o psb_d_pde2d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_d_pde2d $(EXEDIR) + + +psb_s_pde2d: psb_s_pde2d.o + $(FLINK) psb_s_pde2d.o -o psb_s_pde2d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_s_pde2d $(EXEDIR) + + +clean: + /bin/rm -f psb_tomp.o psb_d_pde3d.o psb_s_pde3d.o psb_d_pde2d.o psb_s_pde2d.o *$(.mod) \ + $(EXEDIR)/psb_d_pde3d $(EXEDIR)/psb_s_pde3d $(EXEDIR)/psb_d_pde2d $(EXEDIR)/psb_s_pde2d +verycleanlib: + (cd ../..; make veryclean) +lib: + (cd ../../; make library) + + + diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 new file mode 100644 index 00000000..8681c396 --- /dev/null +++ b/test/omp/psb_tomp.F90 @@ -0,0 +1,1062 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! File: psb_d_pde3d.f90 +! +! Program: psb_d_pde3d +! This sample program solves a linear system obtained by discretizing a +! PDE with Dirichlet BCs. +! +! +! The PDE is a general second order equation in 3d +! +! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) +! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f +! dxdx dydy dzdz dx dy dz +! +! with Dirichlet boundary conditions +! u = g +! +! on the unit cube 0<=x,y,z<=1. +! +! +! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. +! +! There are three choices available for data distribution: +! 1. A simple BLOCK distribution +! 2. A ditribution based on arbitrary assignment of indices to processes, +! typically from a graph partitioner +! 3. A 3D distribution in which the unit cube is partitioned +! into subcubes, each one assigned to a process. +! +! +module psb_d_pde3d_mod + + + use psb_base_mod, only : psb_dpk_, psb_ipk_, psb_lpk_, psb_desc_type,& + & psb_dspmat_type, psb_d_vect_type, dzero,& + & psb_d_base_sparse_mat, psb_d_base_vect_type, & + & psb_i_base_vect_type, psb_l_base_vect_type + + interface + function d_func_3d(x,y,z) result(val) + import :: psb_dpk_ + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + end function d_func_3d + end interface + + interface psb_gen_pde3d + module procedure psb_d_gen_pde3d + end interface psb_gen_pde3d + +contains + + function d_null_func_3d(x,y,z) result(val) + + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + + val = dzero + + end function d_null_func_3d + ! + ! functions parametrizing the differential equation + ! + + ! + ! Note: b1, b2 and b3 are the coefficients of the first + ! derivative of the unknown function. The default + ! we apply here is to have them zero, so that the resulting + ! matrix is symmetric/hermitian and suitable for + ! testing with CG and FCG. + ! When testing methods for non-hermitian matrices you can + ! change the B1/B2/B3 functions to e.g. done/sqrt((3*done)) + ! + function b1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b1 + real(psb_dpk_), intent(in) :: x,y,z + b1=dzero + end function b1 + function b2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b2 + real(psb_dpk_), intent(in) :: x,y,z + b2=dzero + end function b2 + function b3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b3 + real(psb_dpk_), intent(in) :: x,y,z + b3=dzero + end function b3 + function c(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: c + real(psb_dpk_), intent(in) :: x,y,z + c=dzero + end function c + function a1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a1 + real(psb_dpk_), intent(in) :: x,y,z + a1=done/80 + end function a1 + function a2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a2 + real(psb_dpk_), intent(in) :: x,y,z + a2=done/80 + end function a2 + function a3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a3 + real(psb_dpk_), intent(in) :: x,y,z + a3=done/80 + end function a3 + function g(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: g + real(psb_dpk_), intent(in) :: x,y,z + g = dzero + if (x == done) then + g = done + else if (x == dzero) then + g = exp(y**2-z**2) + end if + end function g + + + ! + ! subroutine to allocate and fill in the coefficient matrix and + ! the rhs. + ! + subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info,& + & f,amold,vmold,imold,partition,nrl,iv) + use psb_base_mod + use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif + ! + ! Discretizes the partial differential equation + ! + ! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) + ! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f + ! dxdx dydy dzdz dx dy dz + ! + ! with Dirichlet boundary conditions + ! u = g + ! + ! on the unit cube 0<=x,y,z<=1. + ! + ! + ! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. + ! + implicit none + integer(psb_ipk_) :: idim + type(psb_dspmat_type) :: a + type(psb_d_vect_type) :: xv,bv + type(psb_desc_type) :: desc_a + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: info + character(len=*) :: afmt + procedure(d_func_3d), optional :: f + class(psb_d_base_sparse_mat), optional :: amold + class(psb_d_base_vect_type), optional :: vmold + class(psb_i_base_vect_type), optional :: imold + integer(psb_ipk_), optional :: partition, nrl,iv(:) + + ! Local variables. + + integer(psb_ipk_), parameter :: nb=20 + type(psb_d_csc_sparse_mat) :: acsc + type(psb_d_coo_sparse_mat) :: acoo + type(psb_d_csr_sparse_mat) :: acsr + real(psb_dpk_) :: zt(nb),x,y,z + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_, mysz + integer(psb_lpk_) :: m,n,glob_row,nt + integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner + ! For 3D partition + ! Note: integer control variables going directly into an MPI call + ! must be 4 bytes, i.e. psb_mpk_ + integer(psb_mpk_) :: npdims(3), npp, minfo + integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz + integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:) + ! Process grid + integer(psb_ipk_) :: np, iam + integer(psb_ipk_) :: icoeff + integer(psb_lpk_), allocatable :: myidx(:) + ! deltah dimension of each grid cell + ! deltat discretization time + real(psb_dpk_) :: deltah, sqdeltah, deltah2 + real(psb_dpk_), parameter :: rhs=dzero,one=done,zero=dzero + real(psb_dpk_) :: t0, t1, t2, t3, tasb, talc, ttot, tgen, tcdasb + integer(psb_ipk_) :: err_act + procedure(d_func_3d), pointer :: f_ + character(len=20) :: name, ch_err,tmpfmt + + info = psb_success_ + name = 'create_matrix' + call psb_erractionsave(err_act) + + call psb_info(ctxt, iam, np) + + + if (present(f)) then + f_ => f + else + f_ => d_null_func_3d + end if + + deltah = done/(idim+1) + sqdeltah = deltah*deltah + deltah2 = (2*done)* deltah + + if (present(partition)) then + if ((1<= partition).and.(partition <= 3)) then + partition_ = partition + else + write(*,*) 'Invalid partition choice ',partition,' defaulting to 3' + partition_ = 3 + end if + else + partition_ = 3 + end if + + ! initialize array descriptor and sparse matrix storage. provide an + ! estimate of the number of non zeroes + + m = (1_psb_lpk_*idim)*idim*idim + n = m + nnz = ((n*7)/(np)) + if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n + t0 = psb_wtime() + select case(partition_) + case(1) + ! A BLOCK partition + if (present(nrl)) then + nr = nrl + else + ! + ! Using a simple BLOCK distribution. + ! + nt = (m+np-1)/np + nr = max(0,min(nt,m-(iam*nt))) + end if + + nt = nr + call psb_sum(ctxt,nt) + if (nt /= m) then + write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! First example of use of CDALL: specify for each process a number of + ! contiguous rows + ! + call psb_cdall(ctxt,desc_a,info,nl=nr) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(2) + ! A partition defined by the user through IV + + if (present(iv)) then + if (size(iv) /= m) then + write(psb_err_unit,*) iam, 'Initialization error: wrong IV size',size(iv),m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + else + write(psb_err_unit,*) iam, 'Initialization error: IV not present' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! Second example of use of CDALL: specify for each row the + ! process that owns it + ! + call psb_cdall(ctxt,desc_a,info,vg=iv) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(3) + ! A 3-dimensional partition + + ! A nifty MPI function will split the process list + npdims = 0 +#if defined(SERIAL_MPI) + npdims = 1 +#else + call mpi_dims_create(np,3,npdims,info) +#endif + npx = npdims(1) + npy = npdims(2) + npz = npdims(3) + + allocate(bndx(0:npx),bndy(0:npy),bndz(0:npz)) + ! We can reuse idx2ijk for process indices as well. + call idx2ijk(iamx,iamy,iamz,iam,npx,npy,npz,base=0) + ! Now let's split the 3D cube in hexahedra + call dist1Didx(bndx,idim,npx) + mynx = bndx(iamx+1)-bndx(iamx) + call dist1Didx(bndy,idim,npy) + myny = bndy(iamy+1)-bndy(iamy) + call dist1Didx(bndz,idim,npz) + mynz = bndz(iamz+1)-bndz(iamz) + + ! How many indices do I own? + nlr = mynx*myny*mynz + allocate(myidx(nlr)) + ! Now, let's generate the list of indices I own + nr = 0 + do i=bndx(iamx),bndx(iamx+1)-1 + do j=bndy(iamy),bndy(iamy+1)-1 + do k=bndz(iamz),bndz(iamz+1)-1 + nr = nr + 1 + call ijk2idx(myidx(nr),i,j,k,idim,idim,idim) + end do + end do + end do + if (nr /= nlr) then + write(psb_err_unit,*) iam,iamx,iamy,iamz, 'Initialization error: NR vs NLR ',& + & nr,nlr,mynx,myny,mynz + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + end if + + ! + ! Third example of use of CDALL: specify for each process + ! the set of global indices it owns. + ! + call psb_cdall(ctxt,desc_a,info,vl=myidx) + + + ! + ! Specify process topology + ! + block + ! + ! Use adjcncy methods + ! + integer(psb_mpk_), allocatable :: neighbours(:) + integer(psb_mpk_) :: cnt + logical, parameter :: debug_adj=.true. + if (debug_adj.and.(np > 1)) then + cnt = 0 + allocate(neighbours(np)) + if (iamx < npx-1) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx+1,iamy,iamz,npx,npy,npz,base=0) + end if + if (iamy < npy-1) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy+1,iamz,npx,npy,npz,base=0) + end if + if (iamz < npz-1) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy,iamz+1,npx,npy,npz,base=0) + end if + if (iamx >0) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx-1,iamy,iamz,npx,npy,npz,base=0) + end if + if (iamy >0) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy-1,iamz,npx,npy,npz,base=0) + end if + if (iamz >0) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy,iamz-1,npx,npy,npz,base=0) + end if + call psb_realloc(cnt, neighbours,info) + call desc_a%set_p_adjcncy(neighbours) + !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() + end if + end block + + case default + write(psb_err_unit,*) iam, 'Initialization error: should not get here' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end select + + + if (info == psb_success_) call psb_spall(a,desc_a,info,nnz=nnz, & + & bldmode=psb_matbld_remote_,dupl=psb_dupl_add_) + ! define rhs from boundary conditions; also build initial guess + if (info == psb_success_) call psb_geall(xv,desc_a,info) + if (info == psb_success_) call psb_geall(bv,desc_a,info,& + & bldmode=psb_matbld_remote_,dupl=psb_dupl_add_) + + call psb_barrier(ctxt) + talc = psb_wtime()-t0 + + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='allocation rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + call psb_barrier(ctxt) + t1 = psb_wtime() + !$omp parallel private(i,ii,ib,icoeff,glob_row,x,y,z,zt,ix,iy,iz) + ! shared(deltah,myidx,a,desc_a,nb) + ! we build an auxiliary matrix consisting of one row at a + ! time; just a small matrix. might be extended to generate + ! a bunch of rows per call. + ! + block + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + + ! loop over rows belonging to current process in a block + ! distribution. + + !$omp do + ! + do ii=1, nlr, nb + if (info /= 0) cycle + ib = min(nb,nlr-ii+1) + !ib = min(nb,mysz-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle + end do + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel + + tgen = psb_wtime()-t1 + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='insert rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + + call psb_barrier(ctxt) + t1 = psb_wtime() + call psb_cdasb(desc_a,info,mold=imold) + tcdasb = psb_wtime()-t1 + + call psb_barrier(ctxt) + t1 = psb_wtime() + if (info == psb_success_) then + if (present(amold)) then + call psb_spasb(a,desc_a,info,mold=amold) + else + call psb_spasb(a,desc_a,info,afmt=afmt) + end if + end if + call psb_barrier(ctxt) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + if (info == psb_success_) call psb_geasb(xv,desc_a,info,mold=vmold) + if (info == psb_success_) call psb_geasb(bv,desc_a,info,mold=vmold) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + tasb = psb_wtime()-t1 + call psb_barrier(ctxt) + ttot = psb_wtime() - t0 + + call psb_amx(ctxt,talc) + call psb_amx(ctxt,tgen) + call psb_amx(ctxt,tasb) + call psb_amx(ctxt,ttot) + if(iam == psb_root_) then + tmpfmt = a%get_fmt() + write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")')& + & tmpfmt + write(psb_out_unit,'("-allocation time : ",es12.5)') talc + write(psb_out_unit,'("-coeff. gen. time : ",es12.5)') tgen + write(psb_out_unit,'("-desc asbly time : ",es12.5)') tcdasb + write(psb_out_unit,'("- mat asbly time : ",es12.5)') tasb + write(psb_out_unit,'("-total time : ",es12.5)') ttot + + end if + !call a%print('a.mtx',head='Test') + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(ctxt,err_act) + + return + end subroutine psb_d_gen_pde3d + function outside(i,j,k,bndx,bndy,bndz,iamx,iamy,iamz) result(res) + logical :: res + integer(psb_ipk_), intent(in) :: i,j,k,iamx,iamy,iamz + integer(psb_ipk_), intent(in) :: bndx(0:),bndy(0:),bndz(0:) + + res = (i=bndx(iamx+1)) & + & .or.(j=bndy(iamy+1)) & + & .or.(k=bndz(iamz+1)) + end function outside +end module psb_d_pde3d_mod + +program psb_d_pde3d + use psb_base_mod + use psb_prec_mod + use psb_krylov_mod + use psb_util_mod + use psb_d_pde3d_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + + ! input parameters + character(len=20) :: kmethd, ptype + character(len=5) :: afmt + integer(psb_ipk_) :: idim + integer(psb_epk_) :: system_size + + ! miscellaneous + real(psb_dpk_), parameter :: one = done + real(psb_dpk_) :: t1, t2, tprec + + ! sparse matrix and preconditioner + type(psb_dspmat_type) :: a + type(psb_dprec_type) :: prec + ! descriptor + type(psb_desc_type) :: desc_a + ! dense vectors + type(psb_d_vect_type) :: xxv,bv + ! parallel environment + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: iam, np, nth + + ! solver parameters + integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart + integer(psb_epk_) :: amatsize, precsize, descsize, d2size + real(psb_dpk_) :: err, eps + + ! Parameters for solvers in Block-Jacobi preconditioner + type ainvparms + character(len=12) :: alg, orth_alg, ilu_alg, ilut_scale + integer(psb_ipk_) :: fill, inv_fill + real(psb_dpk_) :: thresh, inv_thresh + end type ainvparms + type(ainvparms) :: parms + + ! other variables + integer(psb_ipk_) :: info, i + character(len=20) :: name,ch_err + character(len=40) :: fname + + info=psb_success_ + + + call psb_init(ctxt) + call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + + if (iam < 0) then + ! This should not happen, but just in case + call psb_exit(ctxt) + stop + endif + if(psb_errstatus_fatal()) goto 9999 + name='pde3d90' + call psb_set_errverbosity(itwo) + call psb_cd_set_large_threshold(2000_psb_ipk_) + ! + ! Hello world + ! + if (iam == psb_root_) then + write(*,*) 'Welcome to PSBLAS version: ',psb_version_string_ + write(*,*) 'This is the ',trim(name),' sample program' + end if + ! + ! get parameters + ! + call get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ipart,parms) + ! + ! allocate and fill in the coefficient matrix, rhs and initial guess + ! + call psb_barrier(ctxt) + t1 = psb_wtime() + call psb_gen_pde3d(ctxt,idim,a,bv,xxv,desc_a,afmt,info,partition=ipart) + call psb_barrier(ctxt) + t2 = psb_wtime() - t1 + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='psb_gen_pde3d' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + if (iam == psb_root_) write(psb_out_unit,'("Overall matrix creation time : ",es12.5)')t2 + if (iam == psb_root_) write(psb_out_unit,'(" ")') + ! + ! prepare the preconditioner. + ! + if(iam == psb_root_) write(psb_out_unit,'("Setting preconditioner to : ",a)')ptype + call prec%init(ctxt,ptype,info) + ! + ! Set the options for the BJAC preconditioner + ! + if (psb_toupper(ptype) == "BJAC") then + call prec%set('sub_solve', parms%alg, info) + select case (psb_toupper(parms%alg)) + case ("ILU") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('ilu_alg', parms%ilu_alg, info) + case ("ILUT") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('sub_iluthrs', parms%thresh, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + case ("AINV") + call prec%set('inv_thresh', parms%inv_thresh, info) + call prec%set('inv_fillin', parms%inv_fill, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + call prec%set('ainv_alg', parms%orth_alg, info) + case ("INVK") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('inv_fillin', parms%inv_fill, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + case ("INVT") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('inv_fillin', parms%inv_fill, info) + call prec%set('sub_iluthrs', parms%thresh, info) + call prec%set('inv_thresh', parms%inv_thresh, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + case default + ! Do nothing, use default setting in the init routine + end select + else + ! nothing to set for NONE or DIAG preconditioner + end if + + call psb_barrier(ctxt) + t1 = psb_wtime() + call prec%build(a,desc_a,info) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='psb_precbld' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + tprec = psb_wtime()-t1 + + call psb_amx(ctxt,tprec) + + if (iam == psb_root_) write(psb_out_unit,'("Preconditioner time : ",es12.5)')tprec + if (iam == psb_root_) write(psb_out_unit,'(" ")') + call prec%descr(info) + ! + ! iterative method parameters + ! + if(iam == psb_root_) write(psb_out_unit,'("Calling iterative method ",a)')kmethd + call psb_barrier(ctxt) + t1 = psb_wtime() + eps = 1.d-6 + call psb_krylov(kmethd,a,prec,bv,xxv,eps,desc_a,info,& + & itmax=itmax,iter=iter,err=err,itrace=itrace,istop=istopc,irst=irst) + + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='solver routine' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + call psb_barrier(ctxt) + t2 = psb_wtime() - t1 + call psb_amx(ctxt,t2) + amatsize = a%sizeof() + descsize = desc_a%sizeof() + precsize = prec%sizeof() + system_size = desc_a%get_global_rows() + call psb_sum(ctxt,amatsize) + call psb_sum(ctxt,descsize) + call psb_sum(ctxt,precsize) + + if (iam == psb_root_) then + write(psb_out_unit,'(" ")') + write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Linear system size : ",i12)') system_size + write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 + write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter + write(psb_out_unit,'("Number of iterations : ",i12)')iter + write(psb_out_unit,'("Convergence indicator on exit : ",es12.5)')err + write(psb_out_unit,'("Info on exit : ",i12)')info + write(psb_out_unit,'("Total memory occupation for A: ",i12)')amatsize + write(psb_out_unit,'("Total memory occupation for PREC: ",i12)')precsize + write(psb_out_unit,'("Total memory occupation for DESC_A: ",i12)')descsize + write(psb_out_unit,'("Storage format for A: ",a)') a%get_fmt() + write(psb_out_unit,'("Storage format for DESC_A: ",a)') desc_a%get_fmt() + end if + + + ! + ! cleanup storage and exit + ! + call psb_gefree(bv,desc_a,info) + call psb_gefree(xxv,desc_a,info) + call psb_spfree(a,desc_a,info) + call prec%free(info) + call psb_cdfree(desc_a,info) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='free routine' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + call psb_exit(ctxt) + stop + +9999 call psb_error(ctxt) + + stop + +contains + ! + ! get iteration parameters from standard input + ! + subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,& + & itmax,itrace,irst,ipart,parms) + type(psb_ctxt_type) :: ctxt + character(len=*) :: kmethd, ptype, afmt + integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst,ipart + integer(psb_ipk_) :: np, iam + integer(psb_ipk_) :: ip, inp_unit + character(len=1024) :: filename + type(ainvparms) :: parms + + call psb_info(ctxt, iam, np) + + if (iam == 0) then + if (command_argument_count()>0) then + call get_command_argument(1,filename) + inp_unit = 30 + open(inp_unit,file=filename,action='read',iostat=info) + if (info /= 0) then + write(psb_err_unit,*) 'Could not open file ',filename,' for input' + call psb_abort(ctxt) + stop + else + write(psb_err_unit,*) 'Opened file ',trim(filename),' for input' + end if + else + inp_unit=psb_inp_unit + end if + read(inp_unit,*) ip + if (ip >= 3) then + read(inp_unit,*) kmethd + read(inp_unit,*) ptype + read(inp_unit,*) afmt + + read(inp_unit,*) idim + if (ip >= 4) then + read(inp_unit,*) ipart + else + ipart = 3 + endif + if (ip >= 5) then + read(inp_unit,*) istopc + else + istopc=1 + endif + if (ip >= 6) then + read(inp_unit,*) itmax + else + itmax=500 + endif + if (ip >= 7) then + read(inp_unit,*) itrace + else + itrace=-1 + endif + if (ip >= 8) then + read(inp_unit,*) irst + else + irst=1 + endif + if (ip >= 9) then + read(inp_unit,*) parms%alg + read(inp_unit,*) parms%ilu_alg + read(inp_unit,*) parms%ilut_scale + read(inp_unit,*) parms%fill + read(inp_unit,*) parms%inv_fill + read(inp_unit,*) parms%thresh + read(inp_unit,*) parms%inv_thresh + read(inp_unit,*) parms%orth_alg + else + parms%alg = 'ILU' ! Block Solver ILU,ILUT,INVK,AINVT,AORTH + parms%ilu_alg = 'NONE' ! If ILU : MILU or NONE othewise ignored + parms%ilut_scale = 'NONE' ! If ILUT: NONE, MAXVAL, DIAG, ARWSUM, ACLSUM, ARCSUM + parms%fill = 0 ! Level of fill for forward factorization + parms%inv_fill = 1 ! Level of fill for inverse factorization (only INVK) + parms%thresh = 1E-1_psb_dpk_ ! Threshold for forward factorization + parms%inv_thresh = 1E-1_psb_dpk_ ! Threshold for inverse factorization + parms%orth_alg = 'LLK' ! What orthogonalization algorithm? + endif + + write(psb_out_unit,'("Solving matrix : ell1")') + write(psb_out_unit,& + & '("Grid dimensions : ",i4," x ",i4," x ",i4)') & + & idim,idim,idim + write(psb_out_unit,'("Number of processors : ",i0)')np + select case(ipart) + case(1) + write(psb_out_unit,'("Data distribution : BLOCK")') + case(3) + write(psb_out_unit,'("Data distribution : 3D")') + case default + ipart = 3 + write(psb_out_unit,'("Unknown data distrbution, defaulting to 3D")') + end select + write(psb_out_unit,'("Preconditioner : ",a)') ptype + if( psb_toupper(ptype) == "BJAC" ) then + write(psb_out_unit,'("Block subsolver : ",a)') parms%alg + select case (psb_toupper(parms%alg)) + case ('ILU') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("MILU : ",a)') parms%ilu_alg + case ('ILUT') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("Threshold : ",es12.5)') parms%thresh + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case ('INVK') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("Invese Fill in : ",i0)') parms%inv_fill + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case ('INVT') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("Threshold : ",es12.5)') parms%thresh + write(psb_out_unit,'("Invese Fill in : ",i0)') parms%inv_fill + write(psb_out_unit,'("Inverse Threshold : ",es12.5)') parms%inv_thresh + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case ('AINV','AORTH') + write(psb_out_unit,'("Inverse Threshold : ",es12.5)') parms%inv_thresh + write(psb_out_unit,'("Invese Fill in : ",i0)') parms%inv_fill + write(psb_out_unit,'("Orthogonalization : ",a)') parms%orth_alg + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case default + write(psb_out_unit,'("Unknown diagonal solver")') + end select + end if + write(psb_out_unit,'("Iterative method : ",a)') kmethd + write(psb_out_unit,'(" ")') + else + ! wrong number of parameter, print an error message and exit + call pr_usage(izero) + call psb_abort(ctxt) + stop 1 + endif + if (inp_unit /= psb_inp_unit) then + close(inp_unit) + end if + + end if + ! broadcast parameters to all processors + call psb_bcast(ctxt,kmethd) + call psb_bcast(ctxt,afmt) + call psb_bcast(ctxt,ptype) + call psb_bcast(ctxt,idim) + call psb_bcast(ctxt,ipart) + call psb_bcast(ctxt,istopc) + call psb_bcast(ctxt,itmax) + call psb_bcast(ctxt,itrace) + call psb_bcast(ctxt,irst) + call psb_bcast(ctxt,parms%alg) + call psb_bcast(ctxt,parms%fill) + call psb_bcast(ctxt,parms%inv_fill) + call psb_bcast(ctxt,parms%thresh) + call psb_bcast(ctxt,parms%inv_thresh) + call psb_bcast(ctxt,parms%orth_alg) + call psb_bcast(ctxt,parms%ilut_scale) + + return + + end subroutine get_parms + ! + ! print an error message + ! + subroutine pr_usage(iout) + integer(psb_ipk_) :: iout + write(iout,*)'incorrect parameter(s) found' + write(iout,*)' usage: pde3d90 methd prec dim & + &[istop itmax itrace]' + write(iout,*)' where:' + write(iout,*)' methd: cgstab cgs rgmres bicgstabl' + write(iout,*)' prec : bjac diag none' + write(iout,*)' dim number of points along each axis' + write(iout,*)' the size of the resulting linear ' + write(iout,*)' system is dim**3' + write(iout,*)' ipart data partition 1 3 ' + write(iout,*)' istop stopping criterion 1, 2 ' + write(iout,*)' itmax maximum number of iterations [500] ' + write(iout,*)' itrace <=0 (no tracing, default) or ' + write(iout,*)' >= 1 do tracing every itrace' + write(iout,*)' iterations ' + end subroutine pr_usage + +end program psb_d_pde3d diff --git a/test/pargen/psb_d_pde3d.F90 b/test/pargen/psb_d_pde3d.F90 index d4eeccf2..0f9df354 100644 --- a/test/pargen/psb_d_pde3d.F90 +++ b/test/pargen/psb_d_pde3d.F90 @@ -799,6 +799,7 @@ program psb_d_pde3d if(psb_errstatus_fatal()) goto 9999 name='pde3d90' call psb_set_errverbosity(itwo) + call psb_cd_set_large_threshold(2000_psb_ipk_) ! ! Hello world ! From eb11e5e053daeaddef8c0f4d4eca48cdf85f0872 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 14 Apr 2023 17:48:15 +0200 Subject: [PATCH 10/38] Put CRITICAL(name) in G2L_INS --- base/modules/desc/psb_gen_block_map_mod.F90 | 1 + ...list_map_mod.f90 => psb_glist_map_mod.F90} | 0 base/modules/desc/psb_hash_map_mod.F90 | 91 ++++++++----------- base/modules/desc/psb_hash_mod.F90 | 49 +++++----- ..._list_map_mod.f90 => psb_list_map_mod.F90} | 59 +++++++++++- ..._repl_map_mod.f90 => psb_repl_map_mod.F90} | 0 6 files changed, 125 insertions(+), 75 deletions(-) rename base/modules/desc/{psb_glist_map_mod.f90 => psb_glist_map_mod.F90} (100%) rename base/modules/desc/{psb_list_map_mod.f90 => psb_list_map_mod.F90} (92%) rename base/modules/desc/{psb_repl_map_mod.f90 => psb_repl_map_mod.F90} (100%) diff --git a/base/modules/desc/psb_gen_block_map_mod.F90 b/base/modules/desc/psb_gen_block_map_mod.F90 index dfb47b61..2c20a547 100644 --- a/base/modules/desc/psb_gen_block_map_mod.F90 +++ b/base/modules/desc/psb_gen_block_map_mod.F90 @@ -488,6 +488,7 @@ contains integer(psb_ipk_) :: iam, np logical :: owned_ + write(0,*) 'block_g2lv2' info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) diff --git a/base/modules/desc/psb_glist_map_mod.f90 b/base/modules/desc/psb_glist_map_mod.F90 similarity index 100% rename from base/modules/desc/psb_glist_map_mod.f90 rename to base/modules/desc/psb_glist_map_mod.F90 diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index af9cdf61..0f9af7ef 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -207,7 +207,6 @@ contains integer(psb_ipk_) :: i logical :: owned_ info = 0 - if (present(mask)) then if (size(mask) < size(idx)) then info = -1 @@ -249,7 +248,6 @@ contains end do end if - end subroutine hash_l2gv1 subroutine hash_l2gv2(idxin,idxout,idxmap,info,mask,owned) @@ -334,7 +332,6 @@ contains info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) - if (present(mask)) then if (size(mask) < size(idx)) then info = -1 @@ -437,7 +434,6 @@ contains end if end if - end subroutine hash_g2lv1 subroutine hash_g2lv2(idxin,idxout,idxmap,info,mask,owned) @@ -460,7 +456,6 @@ contains is = size(idxin) im = min(is,size(idxout)) - info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) @@ -567,7 +562,6 @@ contains end if end if - end subroutine hash_g2lv2 @@ -688,14 +682,13 @@ contains if (use_openmp) then #ifdef OPENMP - call OMP_init_lock(ins_lck) + !call OMP_init_lock(ins_lck) if (idxmap%is_bld()) then isLoopValid = .true. ncol = idxmap%get_lc() if (present(mask)) then - !write(0,*) 'present mask' mask_ = mask else allocate(mask_(size(idx))) @@ -703,13 +696,12 @@ contains end if if (present(lidx)) then - !write(0,*) 'present lidx' if (present(mask)) then - !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & - !$OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) do i = 1, is info = 0 if (mask(i)) then @@ -718,9 +710,9 @@ contains idx(i) = -1 cycle endif - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, @@ -728,7 +720,7 @@ contains call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index @@ -772,7 +764,7 @@ contains else idx(i) = -1 end if - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) end if else idx(i) = lip @@ -782,17 +774,17 @@ contains end if end do - !$OMP END PARALLEL DO + ! $ OMP END PARALLEL DO if (.not. isLoopValid) then goto 9999 end if else - !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & - !$OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) do i = 1, is info = 0 ip = idx(i) @@ -800,9 +792,9 @@ contains idx(i) = -1 cycle endif - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, @@ -810,7 +802,7 @@ contains call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). @@ -851,26 +843,25 @@ contains else idx(i) = -1 end if - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) end if else idx(i) = lip end if end do - !$OMP END PARALLEL DO + ! $ OMP END PARALLEL DO if (.not. isLoopValid) then goto 9999 end if end if else if (.not.present(lidx)) then - !write(0,*) 'not present lidx' if(present(mask)) then - !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & - !$OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) do i = 1, is info = 0 if (mask(i)) then @@ -879,9 +870,9 @@ contains idx(i) = -1 cycle endif - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, @@ -890,7 +881,7 @@ contains & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing, so this lock is relatively rare). @@ -932,7 +923,7 @@ contains else idx(i) = -1 end if - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) end if else idx(i) = lip @@ -942,16 +933,16 @@ contains end if end do - !$OMP END PARALLEL DO + ! $ OMP END PARALLEL DO if (.not. isLoopValid) then goto 9999 end if else - !$OMP PARALLEL DO default(none) schedule(DYNAMIC) & - !$OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) do i = 1, is info = 0 ip = idx(i) @@ -959,9 +950,9 @@ contains idx(i) = -1 cycle endif - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, @@ -969,7 +960,7 @@ contains call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then - call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). @@ -1011,7 +1002,7 @@ contains else idx(i) = -1 end if - call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) end if else @@ -1019,7 +1010,7 @@ contains end if end do - !$OMP END PARALLEL DO + ! $ OMP END PARALLEL DO if (.not. isLoopValid) then goto 9999 @@ -1032,7 +1023,7 @@ contains idx = -1 info = -1 end if - call OMP_destroy_lock(ins_lck) + !call OMP_destroy_lock(ins_lck) #endif else if (.not.use_openmp) then @@ -1139,9 +1130,9 @@ contains else if (.not.present(lidx)) then - if (present(mask)) then + if (present(mask)) then do i = 1, is - if (mask(i)) then + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 @@ -1182,7 +1173,6 @@ contains idx(i) = -1 end if enddo - else if (.not.present(mask)) then do i = 1, is @@ -1227,7 +1217,6 @@ contains info = psb_success_ enddo - end if end if else diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index 0c682670..c3c5ba20 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -409,34 +409,39 @@ contains info = HashDuplicate return end if - !$OMP CRITICAL - if (hash%table(hk,1) == HashFreeEntry) then - if (hash%nk == hash%hsize -1) then - ! - ! Note: because of the way we allocate things at CDALL - ! time this is really unlikely; if we get here, we - ! have at least as many halo indices as internals, which - ! means we're already in trouble. But we try to keep going. - ! - call psb_hash_realloc(hash,info) - if (info /= HashOk) then - info = HashOutOfMemory - !return + !$omp critical(hashsearchins) + if (hash%table(hk,1) == key) then + val = hash%table(hk,2) + info = HashDuplicate + else + if (hash%table(hk,1) == HashFreeEntry) then + if (hash%nk == hash%hsize -1) then + ! + ! Note: because of the way we allocate things at CDALL + ! time this is really unlikely; if we get here, we + ! have at least as many halo indices as internals, which + ! means we're already in trouble. But we try to keep going. + ! + call psb_hash_realloc(hash,info) + if (info /= HashOk) then + info = HashOutOfMemory + !return + else + call psb_hash_searchinskey(key,val,nextval,hash,info) + !return + end if else - call psb_hash_searchinskey(key,val,nextval,hash,info) + hash%nk = hash%nk + 1 + hash%table(hk,1) = key + hash%table(hk,2) = nextval + val = nextval !return end if - else - hash%nk = hash%nk + 1 - hash%table(hk,1) = key - hash%table(hk,2) = nextval - val = nextval - !return end if end if - !$OMP END CRITICAL + !$omp end critical(hashsearchins) if (info /= HashOk) return - if (val > 0) return + if (val > 0) return hk = hk - hd if (hk < 0) hk = hk + hsize end do diff --git a/base/modules/desc/psb_list_map_mod.f90 b/base/modules/desc/psb_list_map_mod.F90 similarity index 92% rename from base/modules/desc/psb_list_map_mod.f90 rename to base/modules/desc/psb_list_map_mod.F90 index 3e3c8e25..5961d5c2 100644 --- a/base/modules/desc/psb_list_map_mod.f90 +++ b/base/modules/desc/psb_list_map_mod.F90 @@ -349,7 +349,6 @@ contains logical :: owned_ info = 0 - if (present(mask)) then if (size(mask) < size(idxin)) then info = -1 @@ -644,7 +643,20 @@ contains if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) info = -4 + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if ((ix <= idxmap%local_rows).or.(info /= 0)) then @@ -654,6 +666,7 @@ contains idxmap%local_cols = max(ix,idxmap%local_cols) idxmap%loc_to_glob(ix) = idxin(i) idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else @@ -668,6 +681,19 @@ contains if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) info = -4 + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if ((ix <= idxmap%local_rows).or.(info /= 0)) then @@ -677,6 +703,7 @@ contains idxmap%local_cols = max(ix,idxmap%local_cols) idxmap%loc_to_glob(ix) = idxin(i) idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else @@ -692,7 +719,20 @@ contains if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) - if (ix < 0) then + if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) info = -4 + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then @@ -702,6 +742,7 @@ contains idxmap%local_cols = ix idxmap%loc_to_glob(ix) = idxin(i) idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else @@ -716,6 +757,19 @@ contains if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) info = -4 + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then @@ -725,6 +779,7 @@ contains idxmap%local_cols = ix idxmap%loc_to_glob(ix) = idxin(i) idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else diff --git a/base/modules/desc/psb_repl_map_mod.f90 b/base/modules/desc/psb_repl_map_mod.F90 similarity index 100% rename from base/modules/desc/psb_repl_map_mod.f90 rename to base/modules/desc/psb_repl_map_mod.F90 From 8459ea28f5359e55b0e51db837dd34c5fdee5873 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 14 Apr 2023 18:25:43 +0200 Subject: [PATCH 11/38] Modified matrix build procedures with OpenMP --- base/serial/impl/psb_c_coo_impl.F90 | 33 ++++++++++++++++-------- base/serial/impl/psb_c_csr_impl.F90 | 29 ++++++++++----------- base/serial/impl/psb_d_coo_impl.F90 | 13 +++++----- base/serial/impl/psb_d_csr_impl.F90 | 29 ++++++++++----------- base/serial/impl/psb_s_coo_impl.F90 | 33 ++++++++++++++++-------- base/serial/impl/psb_s_csr_impl.F90 | 29 ++++++++++----------- base/serial/impl/psb_z_coo_impl.F90 | 33 ++++++++++++++++-------- base/serial/impl/psb_z_csr_impl.F90 | 29 ++++++++++----------- base/tools/psb_cspins.F90 | 39 ++++++++++++++++++++++------- base/tools/psb_dspins.F90 | 18 ++++++------- base/tools/psb_sspins.F90 | 39 ++++++++++++++++++++++------- base/tools/psb_zspins.F90 | 39 ++++++++++++++++++++++------- test/omp/psb_tomp.F90 | 18 +++++++++---- test/pargen/psb_d_pde3d.F90 | 1 - 14 files changed, 236 insertions(+), 146 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 830b7400..c9be113e 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -2818,6 +2818,9 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_coo_sparse_mat), intent(inout) :: a @@ -2829,7 +2832,7 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='c_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2861,10 +2864,11 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2872,16 +2876,23 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) +#if defined(OPENMP) + !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() +#endif end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2951,7 +2962,7 @@ contains end do !$OMP END PARALLEL DO - nza = nza + nz + !nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index a7869136..fc56e9d8 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2997,22 +2997,21 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - - + end subroutine psb_c_cp_csr_from_coo @@ -3128,7 +3127,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3229,7 +3227,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index f4f4f0dc..bb845f4b 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -2832,7 +2832,7 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='d_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit, nzaold + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2864,7 +2864,6 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - if (a%is_bld()) then !$omp critical @@ -2890,7 +2889,7 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) call a%set_sorted(.false.) - + else if (a%is_upd()) then nza = a%get_nzeros() isza = a%get_size() @@ -2945,9 +2944,9 @@ contains ! the serial version: each element is stored in data ! structures but the invalid ones are stored as '-1' values. ! These values will be filtered in a future fixing process. - ! $ OMP PARALLEL DO default(none) schedule(STATIC) & - ! $ OMP shared(nz,imin,imax,jmin,jmax,ia,ja,val,ia1,ia2,aspk,nza) & - ! $ OMP private(ir,ic,i) + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP shared(nz,imin,imax,jmin,jmax,ia,ja,val,ia1,ia2,aspk,nza) & + !$OMP private(ir,ic,i) do i=1,nz ir = ia(i) ic = ja(i) @@ -2961,7 +2960,7 @@ contains aspk(nza+i) = -1 end if end do - ! $OMP END PARALLEL DO + !$OMP END PARALLEL DO !nza = nza + nz #else diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index a2ddad30..1e579aaa 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2997,22 +2997,21 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - - + end subroutine psb_d_cp_csr_from_coo @@ -3128,7 +3127,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3229,7 +3227,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index be4218de..0b201684 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -2818,6 +2818,9 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_coo_sparse_mat), intent(inout) :: a @@ -2829,7 +2832,7 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='s_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2861,10 +2864,11 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2872,16 +2876,23 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) +#if defined(OPENMP) + !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() +#endif end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2951,7 +2962,7 @@ contains end do !$OMP END PARALLEL DO - nza = nza + nz + !nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index a8c67c10..4eeaaf5d 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2997,22 +2997,21 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - - + end subroutine psb_s_cp_csr_from_coo @@ -3128,7 +3127,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3229,7 +3227,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 4f99cb5c..14410f23 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -2818,6 +2818,9 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_coo_sparse_mat), intent(inout) :: a @@ -2829,7 +2832,7 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='z_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2861,10 +2864,11 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2872,16 +2876,23 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) +#if defined(OPENMP) + !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() +#endif end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2951,7 +2962,7 @@ contains end do !$OMP END PARALLEL DO - nza = nza + nz + !nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 6344b268..3e1dacb9 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2997,22 +2997,21 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif call a%set_host() - - + end subroutine psb_z_cp_csr_from_coo @@ -3128,7 +3127,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. #if defined(OPENMP) integer(psb_ipk_), allocatable :: sum(:) @@ -3229,7 +3227,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END PARALLEL #else - do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index 27cfbd8e..0a65fb5c 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -51,6 +51,9 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_cspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -70,7 +73,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k + integer(psb_ipk_) :: i,k, ith, nth integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) complex(psb_spk_), allocatable :: lval(:) @@ -82,7 +85,13 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) - +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif if (nz < 0) then info = 1111 call psb_errpush(info,name) @@ -131,15 +140,23 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if - - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol) +#endif + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp critical(cSPINS) +#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - +#if defined(OPENMP) + !$omp end critical(cSPINS) +#endif + if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - goto 9999 + !goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() @@ -149,13 +166,12 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -175,8 +191,13 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - goto 9999 + !goto 9999 end if + +#if defined(OPENMP) + !$omp end parallel +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 9c4f137f..6f700bcc 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -73,7 +73,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k + integer(psb_ipk_) :: i,k, ith, nth integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) real(psb_dpk_), allocatable :: lval(:) @@ -86,7 +86,11 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) #if defined(OPENMP) - !write(0,*) name,omp_get_num_threads(),omp_get_thread_num() + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 #endif if (nz < 0) then info = 1111 @@ -139,17 +143,14 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #if defined(OPENMP) !$omp parallel private(ila,jla,nrow,ncol) #endif - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) #if defined(OPENMP) - !$omp critical + !$omp critical(dSPINS) #endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) -!!$ write(0,*) omp_get_thread_num(),'Check g2l: ',ila(1:nz),':',& -!!$ & jla(1:nz),':',ja(1:nz) #if defined(OPENMP) - !$omp end critical + !$omp end critical(dSPINS) #endif if (info /= psb_success_) then @@ -171,7 +172,6 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -197,7 +197,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #if defined(OPENMP) !$omp end parallel #endif - + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index aee7a900..1e7f9037 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -51,6 +51,9 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_sspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -70,7 +73,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k + integer(psb_ipk_) :: i,k, ith, nth integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) real(psb_spk_), allocatable :: lval(:) @@ -82,7 +85,13 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) - +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif if (nz < 0) then info = 1111 call psb_errpush(info,name) @@ -131,15 +140,23 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if - - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol) +#endif + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp critical(sSPINS) +#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - +#if defined(OPENMP) + !$omp end critical(sSPINS) +#endif + if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - goto 9999 + !goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() @@ -149,13 +166,12 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -175,8 +191,13 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - goto 9999 + !goto 9999 end if + +#if defined(OPENMP) + !$omp end parallel +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index abe64251..40c0783b 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -51,6 +51,9 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_zspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -70,7 +73,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k + integer(psb_ipk_) :: i,k, ith, nth integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) complex(psb_dpk_), allocatable :: lval(:) @@ -82,7 +85,13 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) - +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif if (nz < 0) then info = 1111 call psb_errpush(info,name) @@ -131,15 +140,23 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if - - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol) +#endif + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) +#if defined(OPENMP) + !$omp critical(zSPINS) +#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - +#if defined(OPENMP) + !$omp end critical(zSPINS) +#endif + if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - goto 9999 + !goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() @@ -149,13 +166,12 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -175,8 +191,13 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - goto 9999 + !goto 9999 end if + +#if defined(OPENMP) + !$omp end parallel +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 index 8681c396..7965a5f4 100644 --- a/test/omp/psb_tomp.F90 +++ b/test/omp/psb_tomp.F90 @@ -451,16 +451,24 @@ contains call psb_barrier(ctxt) t1 = psb_wtime() - !$omp parallel private(i,ii,ib,icoeff,glob_row,x,y,z,zt,ix,iy,iz) - ! shared(deltah,myidx,a,desc_a,nb) + !$omp parallel shared(deltah,myidx,a,desc_a) ! we build an auxiliary matrix consisting of one row at a ! time; just a small matrix. might be extended to generate ! a bunch of rows per call. ! block + integer(psb_ipk_) :: i,j,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row integer(psb_lpk_), allocatable :: irow(:),icol(:) real(psb_dpk_), allocatable :: val(:) - + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif allocate(val(20*nb),irow(20*nb),& &icol(20*nb),stat=info) if (info /= psb_success_ ) then @@ -473,7 +481,7 @@ contains ! loop over rows belonging to current process in a block ! distribution. - !$omp do + !$omp do schedule(dynamic,4) ! do ii=1, nlr, nb if (info /= 0) cycle @@ -723,7 +731,7 @@ program psb_d_pde3d if(psb_errstatus_fatal()) goto 9999 name='pde3d90' call psb_set_errverbosity(itwo) - call psb_cd_set_large_threshold(2000_psb_ipk_) + !call psb_cd_set_large_threshold(2000_psb_ipk_) ! ! Hello world ! diff --git a/test/pargen/psb_d_pde3d.F90 b/test/pargen/psb_d_pde3d.F90 index 0f9df354..d4eeccf2 100644 --- a/test/pargen/psb_d_pde3d.F90 +++ b/test/pargen/psb_d_pde3d.F90 @@ -799,7 +799,6 @@ program psb_d_pde3d if(psb_errstatus_fatal()) goto 9999 name='pde3d90' call psb_set_errverbosity(itwo) - call psb_cd_set_large_threshold(2000_psb_ipk_) ! ! Hello world ! From f068d73ef1ef33b8bb5ca162573edee5950dc4ef Mon Sep 17 00:00:00 2001 From: sfilippone Date: Sun, 16 Apr 2023 11:30:09 +0200 Subject: [PATCH 12/38] First working version --- base/modules/desc/psb_hash_map_mod.F90 | 20 ++++++++++++++------ base/tools/psb_cspins.F90 | 4 ++-- base/tools/psb_dspins.F90 | 4 ++-- base/tools/psb_sspins.F90 | 4 ++-- base/tools/psb_zspins.F90 | 4 ++-- test/omp/psb_tomp.F90 | 4 ++-- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index 0f9af7ef..eac8cc7a 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -649,7 +649,7 @@ contains integer(psb_ipk_) :: me, np character(len=20) :: name,ch_err logical, allocatable :: mask_(:) - logical :: use_openmp = .false. + logical :: use_openmp = .true. #ifdef OPENMP integer(kind = OMP_lock_kind) :: ins_lck #endif @@ -681,7 +681,7 @@ contains nrow = idxmap%get_lr() if (use_openmp) then -#ifdef OPENMP +#ifdef OPENMP !call OMP_init_lock(ins_lck) if (idxmap%is_bld()) then @@ -697,6 +697,7 @@ contains if (present(lidx)) then if (present(mask)) then + !$omp critical(hash_g2l_ins) ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & @@ -775,11 +776,13 @@ contains end do ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) if (.not. isLoopValid) then goto 9999 end if else + !$omp critical(hash_g2l_ins) ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & @@ -851,7 +854,8 @@ contains end do ! $ OMP END PARALLEL DO - + !$omp end critical(hash_g2l_ins) + if (.not. isLoopValid) then goto 9999 end if @@ -861,7 +865,9 @@ contains ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) + ! $ OMP reduction(.AND.:isLoopValid) + !$omp critical(hash_g2l_ins) + do i = 1, is info = 0 if (mask(i)) then @@ -934,7 +940,8 @@ contains end do ! $ OMP END PARALLEL DO - + !$omp end critical(hash_g2l_ins) + if (.not. isLoopValid) then goto 9999 end if @@ -943,6 +950,7 @@ contains ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & ! $ OMP private(i,ip,lip,tlip,nxt,info) & ! $ OMP reduction(.AND.:isLoopValid) + !$omp critical(hash_g2l_ins) do i = 1, is info = 0 ip = idx(i) @@ -1011,6 +1019,7 @@ contains end do ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) if (.not. isLoopValid) then goto 9999 @@ -1024,7 +1033,6 @@ contains info = -1 end if !call OMP_destroy_lock(ins_lck) - #endif else if (.not.use_openmp) then #ifdef OPENMP diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index 0a65fb5c..cfa88c55 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -145,12 +145,12 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) #if defined(OPENMP) - !$omp critical(cSPINS) + !$omp critical(cspins) #endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) #if defined(OPENMP) - !$omp end critical(cSPINS) + !$omp end critical(cspins) #endif if (info /= psb_success_) then diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 6f700bcc..0ba71096 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -145,12 +145,12 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) #if defined(OPENMP) - !$omp critical(dSPINS) + !$omp critical(dspins) #endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) #if defined(OPENMP) - !$omp end critical(dSPINS) + !$omp end critical(dspins) #endif if (info /= psb_success_) then diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index 1e7f9037..71cd293a 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -145,12 +145,12 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) #if defined(OPENMP) - !$omp critical(sSPINS) + !$omp critical(sspins) #endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) #if defined(OPENMP) - !$omp end critical(sSPINS) + !$omp end critical(sspins) #endif if (info /= psb_success_) then diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index 40c0783b..be24a047 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -145,12 +145,12 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) #if defined(OPENMP) - !$omp critical(zSPINS) + !$omp critical(zspins) #endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) #if defined(OPENMP) - !$omp end critical(zSPINS) + !$omp end critical(zspins) #endif if (info /= psb_success_) then diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 index 7965a5f4..0a13a41b 100644 --- a/test/omp/psb_tomp.F90 +++ b/test/omp/psb_tomp.F90 @@ -639,7 +639,7 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if - !call a%print('a.mtx',head='Test') + call a%print('a.mtx',head='Test') call psb_erractionrestore(err_act) return @@ -731,7 +731,7 @@ program psb_d_pde3d if(psb_errstatus_fatal()) goto 9999 name='pde3d90' call psb_set_errverbosity(itwo) - !call psb_cd_set_large_threshold(2000_psb_ipk_) + call psb_cd_set_large_threshold(2000_psb_ipk_) ! ! Hello world ! From 5a5712b4f079c242c5ae3938deb5826dc54ac388 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Sun, 16 Apr 2023 12:02:54 +0200 Subject: [PATCH 13/38] Rely on CRITICAL inside G2L_INS implementation --- base/tools/psb_cspins.F90 | 6 ------ base/tools/psb_dspins.F90 | 6 ------ base/tools/psb_sspins.F90 | 6 ------ base/tools/psb_zspins.F90 | 6 ------ test/omp/psb_tomp.F90 | 2 +- 5 files changed, 1 insertion(+), 25 deletions(-) diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index cfa88c55..66e03ed9 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -144,14 +144,8 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) !$omp parallel private(ila,jla,nrow,ncol) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) -#if defined(OPENMP) - !$omp critical(cspins) -#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) -#if defined(OPENMP) - !$omp end critical(cspins) -#endif if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 0ba71096..2e11c511 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -144,14 +144,8 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) !$omp parallel private(ila,jla,nrow,ncol) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) -#if defined(OPENMP) - !$omp critical(dspins) -#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) -#if defined(OPENMP) - !$omp end critical(dspins) -#endif if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index 71cd293a..90fa9e2d 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -144,14 +144,8 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) !$omp parallel private(ila,jla,nrow,ncol) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) -#if defined(OPENMP) - !$omp critical(sspins) -#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) -#if defined(OPENMP) - !$omp end critical(sspins) -#endif if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index be24a047..74e2d5dc 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -144,14 +144,8 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) !$omp parallel private(ila,jla,nrow,ncol) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) -#if defined(OPENMP) - !$omp critical(zspins) -#endif if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) -#if defined(OPENMP) - !$omp end critical(zspins) -#endif if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 index 0a13a41b..468eeb5a 100644 --- a/test/omp/psb_tomp.F90 +++ b/test/omp/psb_tomp.F90 @@ -731,7 +731,7 @@ program psb_d_pde3d if(psb_errstatus_fatal()) goto 9999 name='pde3d90' call psb_set_errverbosity(itwo) - call psb_cd_set_large_threshold(2000_psb_ipk_) + call psb_cd_set_large_threshold(125000_psb_ipk_) ! ! Hello world ! From 6ba7d9315933c0da85de59cefe610644bbff0d02 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Sun, 16 Apr 2023 12:03:16 +0200 Subject: [PATCH 14/38] Fix CRITICAL in LIST%G2L_INS --- base/modules/desc/psb_list_map_mod.F90 | 207 ++++++++++++++++++------- 1 file changed, 148 insertions(+), 59 deletions(-) diff --git a/base/modules/desc/psb_list_map_mod.F90 b/base/modules/desc/psb_list_map_mod.F90 index 5961d5c2..6b61cf52 100644 --- a/base/modules/desc/psb_list_map_mod.F90 +++ b/base/modules/desc/psb_list_map_mod.F90 @@ -500,19 +500,37 @@ contains if (present(lidx)) then if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if ((ix <= idxmap%local_rows).or.(info /= 0)) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if ((ix <= idxmap%local_rows).or.(info /= 0)) then + if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -524,18 +542,37 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if ((ix <= idxmap%local_rows).or.(info /= 0)) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if ((ix <= idxmap%local_rows).or.(info /= 0)) then + if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -548,19 +585,37 @@ contains if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if (info /= 0) then + if (info /= 0) then info = -4 - return + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -572,18 +627,36 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then info = -4 - return + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -640,32 +713,36 @@ contains if (present(lidx)) then if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then #if defined(OPENMP) - !$OMP CRITICAL(LISTINS) - ix = idxmap%glob_to_loc(idxin(i)) - if (ix < 0) then - ix = lidx(i) - call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if (info /= 0) info = -4 - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix - end if - !$OMP END CRITICAL(LISTINS) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) #else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix #endif end if idxout(i) = ix @@ -678,6 +755,7 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then @@ -687,10 +765,13 @@ contains if (ix < 0) then ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if (info /= 0) info = -4 - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix + if (info /= 0) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if end if !$OMP END CRITICAL(LISTINS) #else @@ -698,11 +779,11 @@ contains call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix #endif end if idxout(i) = ix @@ -716,6 +797,7 @@ contains if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) @@ -726,10 +808,13 @@ contains if (ix < 0) then ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if (info /= 0) info = -4 - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if end if !$OMP END CRITICAL(LISTINS) #else @@ -737,11 +822,11 @@ contains call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then info = -4 - return + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix #endif end if idxout(i) = ix @@ -754,6 +839,7 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then @@ -763,10 +849,13 @@ contains if (ix < 0) then ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if (info /= 0) info = -4 - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if end if !$OMP END CRITICAL(LISTINS) #else @@ -774,11 +863,11 @@ contains call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then info = -4 - return - end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if #endif end if idxout(i) = ix From dbd55321f83b3f40d075999756050748138cb957 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 17 Apr 2023 21:00:20 +0200 Subject: [PATCH 15/38] Fixed CSR mv and cp _from_coo with OpenMP. --- base/serial/impl/psb_c_csr_impl.F90 | 200 +++++++++++++--------------- base/serial/impl/psb_d_csr_impl.F90 | 200 +++++++++++++--------------- base/serial/impl/psb_s_csr_impl.F90 | 200 +++++++++++++--------------- base/serial/impl/psb_z_csr_impl.F90 | 200 +++++++++++++--------------- test/omp/psb_tomp.F90 | 11 +- 5 files changed, 394 insertions(+), 417 deletions(-) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index fc56e9d8..e091e020 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2877,7 +2877,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) logical :: use_openmp = .false. #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads #endif @@ -2923,81 +2923,74 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 #if defined(OPENMP) - maxthreads = omp_get_max_threads() - allocate(sum(maxthreads+1)) - sum(:) = 0 - sum(1) = 1 - + !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - - last_idx = first_idx + work - 1 - + + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - + !$OMP END SINGLE + !$OMP BARRIER - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do - + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE + !$OMP END PARALLEL #else - + a%irp(:) = 0 do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3010,6 +3003,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) end do a%irp(nr+1) = ip #endif + call a%set_host() end subroutine psb_c_cp_csr_from_coo @@ -3129,7 +3123,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif @@ -3156,90 +3150,88 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() - allocate(sum(nthreads+1)) - sum(:) = 0 - sum(1) = 1 + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - last_idx = first_idx + work - 1 - + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE + !$OMP END SINGLE - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if + !$OMP BARRIER - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE !$OMP END PARALLEL #else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif - + + !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) + !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_c_mv_csr_from_coo diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 1e579aaa..e6b9f958 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2877,7 +2877,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) logical :: use_openmp = .false. #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads #endif @@ -2923,81 +2923,74 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 #if defined(OPENMP) - maxthreads = omp_get_max_threads() - allocate(sum(maxthreads+1)) - sum(:) = 0 - sum(1) = 1 - + !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - - last_idx = first_idx + work - 1 - + + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - + !$OMP END SINGLE + !$OMP BARRIER - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do - + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE + !$OMP END PARALLEL #else - + a%irp(:) = 0 do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3010,6 +3003,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) end do a%irp(nr+1) = ip #endif + call a%set_host() end subroutine psb_d_cp_csr_from_coo @@ -3129,7 +3123,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif @@ -3156,90 +3150,88 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() - allocate(sum(nthreads+1)) - sum(:) = 0 - sum(1) = 1 + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - last_idx = first_idx + work - 1 - + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE + !$OMP END SINGLE - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if + !$OMP BARRIER - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE !$OMP END PARALLEL #else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif - + + !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) + !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_d_mv_csr_from_coo diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 4eeaaf5d..73f5484a 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2877,7 +2877,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) logical :: use_openmp = .false. #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads #endif @@ -2923,81 +2923,74 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 #if defined(OPENMP) - maxthreads = omp_get_max_threads() - allocate(sum(maxthreads+1)) - sum(:) = 0 - sum(1) = 1 - + !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - - last_idx = first_idx + work - 1 - + + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - + !$OMP END SINGLE + !$OMP BARRIER - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do - + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE + !$OMP END PARALLEL #else - + a%irp(:) = 0 do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3010,6 +3003,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) end do a%irp(nr+1) = ip #endif + call a%set_host() end subroutine psb_s_cp_csr_from_coo @@ -3129,7 +3123,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif @@ -3156,90 +3150,88 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() - allocate(sum(nthreads+1)) - sum(:) = 0 - sum(1) = 1 + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - last_idx = first_idx + work - 1 - + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE + !$OMP END SINGLE - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if + !$OMP BARRIER - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE !$OMP END PARALLEL #else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif - + + !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) + !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_s_mv_csr_from_coo diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 3e1dacb9..b07e37b7 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2877,7 +2877,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) logical :: use_openmp = .false. #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads #endif @@ -2923,81 +2923,74 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 #if defined(OPENMP) - maxthreads = omp_get_max_threads() - allocate(sum(maxthreads+1)) - sum(:) = 0 - sum(1) = 1 - + !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - - last_idx = first_idx + work - 1 - + + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - + !$OMP END SINGLE + !$OMP BARRIER - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do - + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE + !$OMP END PARALLEL #else - + a%irp(:) = 0 do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3010,6 +3003,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) end do a%irp(nr+1) = ip #endif + call a%set_host() end subroutine psb_z_cp_csr_from_coo @@ -3129,7 +3123,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' #if defined(OPENMP) - integer(psb_ipk_), allocatable :: sum(:) + integer(psb_ipk_), allocatable :: suma(:) integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s integer(psb_ipk_) :: nxt_val,old_val,saved_elem #endif @@ -3156,90 +3150,88 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,nr,a,itemp,nza) & + !$OMP private(ithread,work,i,first_idx,last_idx,s) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) + !$OMP private(k) do k=1,nza i = itemp(k) - a%irp(i) = a%irp(i) + 1 + !$OMP ATOMIC UPDATE + a%irp(i+1) = a%irp(i+1) + 1 + !$OMP END ATOMIC end do !$OMP END DO - + !$OMP SINGLE nthreads = omp_get_num_threads() - allocate(sum(nthreads+1)) - sum(:) = 0 - sum(1) = 1 + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 !$OMP END SINGLE - ithread = omp_get_thread_num() - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then + + work = (nr+1)/nthreads + if (ithread < MOD((nr+1),nthreads)) then work = work + 1 first_idx = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + first_idx = ithread*work + MOD((nr+1),nthreads) + 1 end if - last_idx = first_idx + work - 1 - + last_idx = min(first_idx + work - 1,nr+1) s = 0 - do i=first_idx,last_idx - s = s + a%irp(i) - end do - if (work > 0) then - sum(ithread+2) = s + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + a%irp(i) + a%irp(i) = a%irp(i)+a%irp(i-1) + end do end if - !$OMP BARRIER - !$OMP SINGLE do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) + suma(i) = suma(i) + suma(i-1) end do - !$OMP END SINGLE + !$OMP END SINGLE - if (work > 0) then - saved_elem = a%irp(first_idx) - end if - if (ithread == 0) then - a%irp(1) = 1 - end if - - if (work > 0) then - old_val = a%irp(first_idx+1) - a%irp(first_idx+1) = saved_elem + sum(ithread+1) - end if + !$OMP BARRIER - do i=first_idx+2,last_idx+1 - nxt_val = a%irp(i) - a%irp(i) = a%irp(i-1) + old_val - old_val = nxt_val + !$OMP DO SCHEDULE(STATIC) + do i=1,nr+1 + a%irp(i) = suma(ithread+1) + a%irp(i) +1 end do + !$OMP END DO + !$OMP SINGLE + a%irp(1) = 1 + !$OMP END SINGLE !$OMP END PARALLEL #else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + ip = 1 + do i=1,nr + ncl = a%irp(i) + a%irp(i) = ip + ip = ip + ncl + end do + a%irp(nr+1) = ip #endif - + + !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) + !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_z_mv_csr_from_coo diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 index 468eeb5a..f3d19a25 100644 --- a/test/omp/psb_tomp.F90 +++ b/test/omp/psb_tomp.F90 @@ -639,7 +639,16 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if - call a%print('a.mtx',head='Test') +!!$ !$omp parallel +!!$ !$omp master +!!$ block +!!$ character(len=1024) :: fname +!!$ write(fname,'(a,i4.4,a,i4.4,a)') 'a-',iam,'-',np,'.mtx' +!!$ write(0,*) iam,' Size of A ',a%get_nrows(),a%get_ncols(),a%get_nzeros() +!!$ call a%print(fname,head='Test') +!!$ end block +!!$ !$omp end master +!!$ !$omp end parallel call psb_erractionrestore(err_act) return From 02dd204351332fb7f9c4bb9b4e550602dde7935e Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 18 Apr 2023 17:00:22 +0200 Subject: [PATCH 16/38] Implement psi_exscan and use in _from_coo --- base/modules/auxil/psi_c_serial_mod.f90 | 12 + base/modules/auxil/psi_d_serial_mod.f90 | 12 + base/modules/auxil/psi_e_serial_mod.f90 | 12 + base/modules/auxil/psi_i2_serial_mod.f90 | 12 + base/modules/auxil/psi_m_serial_mod.f90 | 12 + base/modules/auxil/psi_s_serial_mod.f90 | 12 + base/modules/auxil/psi_z_serial_mod.f90 | 12 + ...{psb_c_csc_impl.f90 => psb_c_csc_impl.F90} | 34 +- base/serial/impl/psb_c_csr_impl.F90 | 126 +------ ...{psb_d_csc_impl.f90 => psb_d_csc_impl.F90} | 34 +- base/serial/impl/psb_d_csr_impl.F90 | 126 +------ ...{psb_s_csc_impl.f90 => psb_s_csc_impl.F90} | 34 +- base/serial/impl/psb_s_csr_impl.F90 | 126 +------ ...{psb_z_csc_impl.f90 => psb_z_csc_impl.F90} | 34 +- base/serial/impl/psb_z_csr_impl.F90 | 126 +------ base/serial/psi_c_serial_impl.F90 | 91 +++++ base/serial/psi_d_serial_impl.F90 | 91 +++++ base/serial/psi_e_serial_impl.F90 | 330 ++++++++++++++++- base/serial/psi_i2_serial_impl.F90 | 330 ++++++++++++++++- base/serial/psi_m_serial_impl.F90 | 332 +++++++++++++++++- base/serial/psi_s_serial_impl.F90 | 91 +++++ base/serial/psi_z_serial_impl.F90 | 91 +++++ 22 files changed, 1554 insertions(+), 526 deletions(-) rename base/serial/impl/{psb_c_csc_impl.f90 => psb_c_csc_impl.F90} (99%) rename base/serial/impl/{psb_d_csc_impl.f90 => psb_d_csc_impl.F90} (99%) rename base/serial/impl/{psb_s_csc_impl.f90 => psb_s_csc_impl.F90} (99%) rename base/serial/impl/{psb_z_csc_impl.f90 => psb_z_csc_impl.F90} (99%) diff --git a/base/modules/auxil/psi_c_serial_mod.f90 b/base/modules/auxil/psi_c_serial_mod.f90 index 191e7ef3..d62ba3bc 100644 --- a/base/modules/auxil/psi_c_serial_mod.f90 +++ b/base/modules/auxil/psi_c_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_c_serial_mod end subroutine psi_csctv end interface psi_sct + interface psi_exscan + subroutine psi_c_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_spk_ + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_c_exscanv + end interface psi_exscan + end module psi_c_serial_mod diff --git a/base/modules/auxil/psi_d_serial_mod.f90 b/base/modules/auxil/psi_d_serial_mod.f90 index f1dbc16c..ae88be74 100644 --- a/base/modules/auxil/psi_d_serial_mod.f90 +++ b/base/modules/auxil/psi_d_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_d_serial_mod end subroutine psi_dsctv end interface psi_sct + interface psi_exscan + subroutine psi_d_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_dpk_ + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_d_exscanv + end interface psi_exscan + end module psi_d_serial_mod diff --git a/base/modules/auxil/psi_e_serial_mod.f90 b/base/modules/auxil/psi_e_serial_mod.f90 index 909b025c..a5544075 100644 --- a/base/modules/auxil/psi_e_serial_mod.f90 +++ b/base/modules/auxil/psi_e_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_e_serial_mod end subroutine psi_esctv end interface psi_sct + interface psi_exscan + subroutine psi_e_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_epk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_epk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_e_exscanv + end interface psi_exscan + end module psi_e_serial_mod diff --git a/base/modules/auxil/psi_i2_serial_mod.f90 b/base/modules/auxil/psi_i2_serial_mod.f90 index 38ef9c38..c0b5a327 100644 --- a/base/modules/auxil/psi_i2_serial_mod.f90 +++ b/base/modules/auxil/psi_i2_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_i2_serial_mod end subroutine psi_i2sctv end interface psi_sct + interface psi_exscan + subroutine psi_i2_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_i2pk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_i2pk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_i2_exscanv + end interface psi_exscan + end module psi_i2_serial_mod diff --git a/base/modules/auxil/psi_m_serial_mod.f90 b/base/modules/auxil/psi_m_serial_mod.f90 index a80d0ffe..ab875f7b 100644 --- a/base/modules/auxil/psi_m_serial_mod.f90 +++ b/base/modules/auxil/psi_m_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_m_serial_mod end subroutine psi_msctv end interface psi_sct + interface psi_exscan + subroutine psi_m_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_mpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_m_exscanv + end interface psi_exscan + end module psi_m_serial_mod diff --git a/base/modules/auxil/psi_s_serial_mod.f90 b/base/modules/auxil/psi_s_serial_mod.f90 index 3c0a3bdc..fee1afc6 100644 --- a/base/modules/auxil/psi_s_serial_mod.f90 +++ b/base/modules/auxil/psi_s_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_s_serial_mod end subroutine psi_ssctv end interface psi_sct + interface psi_exscan + subroutine psi_s_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_spk_ + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_s_exscanv + end interface psi_exscan + end module psi_s_serial_mod diff --git a/base/modules/auxil/psi_z_serial_mod.f90 b/base/modules/auxil/psi_z_serial_mod.f90 index 7bc9728e..3ab430cc 100644 --- a/base/modules/auxil/psi_z_serial_mod.f90 +++ b/base/modules/auxil/psi_z_serial_mod.f90 @@ -156,4 +156,16 @@ module psi_z_serial_mod end subroutine psi_zsctv end interface psi_sct + interface psi_exscan + subroutine psi_z_exscanv(n,x,info,shift,ibase) + import :: psb_ipk_, psb_dpk_ + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + end subroutine psi_z_exscanv + end interface psi_exscan + end module psi_z_serial_mod diff --git a/base/serial/impl/psb_c_csc_impl.f90 b/base/serial/impl/psb_c_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_c_csc_impl.f90 rename to base/serial/impl/psb_c_csc_impl.F90 index 87d7e3dd..c573a40d 100644 --- a/base/serial/impl/psb_c_csc_impl.f90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_c_base_mat_mod use psb_c_csc_mat_mod, psb_protect_name => psb_c_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index e091e020..388e9e2b 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) character(len=20) :: name='c_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/impl/psb_d_csc_impl.f90 b/base/serial/impl/psb_d_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_d_csc_impl.f90 rename to base/serial/impl/psb_d_csc_impl.F90 index 4f10439b..891df5a3 100644 --- a/base/serial/impl/psb_d_csc_impl.f90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_d_base_mat_mod use psb_d_csc_mat_mod, psb_protect_name => psb_d_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index e6b9f958..1762604d 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) character(len=20) :: name='d_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/impl/psb_s_csc_impl.f90 b/base/serial/impl/psb_s_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_s_csc_impl.f90 rename to base/serial/impl/psb_s_csc_impl.F90 index e52086d1..2bf77184 100644 --- a/base/serial/impl/psb_s_csc_impl.f90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_s_base_mat_mod use psb_s_csc_mat_mod, psb_protect_name => psb_s_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 73f5484a..71da21a4 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) character(len=20) :: name='s_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/impl/psb_z_csc_impl.f90 b/base/serial/impl/psb_z_csc_impl.F90 similarity index 99% rename from base/serial/impl/psb_z_csc_impl.f90 rename to base/serial/impl/psb_z_csc_impl.F90 index 8b0ccc65..22ea3677 100644 --- a/base/serial/impl/psb_z_csc_impl.f90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_z_base_mat_mod use psb_z_csc_mat_mod, psb_protect_name => psb_z_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,35 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + !$OMP END PARALLEL + +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip +#endif + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) call a%set_host() diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index b07e37b7..34332f46 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2876,12 +2876,6 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) character(len=20) :: name='z_cp_csr_from_coo' logical :: use_openmp = .false. -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2927,8 +2921,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -2943,51 +2937,6 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2995,14 +2944,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3122,12 +3065,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' -#if defined(OPENMP) - integer(psb_ipk_), allocatable :: suma(:) - integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - integer(psb_ipk_) :: nxt_val,old_val,saved_elem -#endif - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -3153,8 +3090,8 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,i,first_idx,last_idx,s) + !$OMP shared(nr,a,itemp,nza) & + !$OMP private(i,info) !$OMP WORKSHARE a%irp(:) = 0 @@ -3169,51 +3106,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - work = (nr+1)/nthreads - if (ithread < MOD((nr+1),nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD((nr+1),nthreads) + 1 - end if - - last_idx = min(first_idx + work - 1,nr+1) - s = 0 - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + a%irp(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + a%irp(i) - a%irp(i) = a%irp(i)+a%irp(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,nr+1 - a%irp(i) = suma(ithread+1) + a%irp(i) +1 - end do - !$OMP END DO - !$OMP SINGLE - a%irp(1) = 1 - !$OMP END SINGLE - !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3221,14 +3113,8 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip #endif + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) !write(0,*) name,a%irp(:) diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index 1da2ce6e..91f463f4 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_c_exscanv(n,x,info,shift,ibase) + use psi_c_serial_mod, psb_protect_name => psi_c_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + complex(psb_spk_) :: shift_, tp, ts + complex(psb_spk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = czero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_c_exscanv + subroutine psb_m_cgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_cgelp use psb_const_mod diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index 8c65b349..d8fb92d7 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_d_exscanv(n,x,info,shift,ibase) + use psi_d_serial_mod, psb_protect_name => psi_d_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + real(psb_dpk_) :: shift_, tp, ts + real(psb_dpk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = dzero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_d_exscanv + subroutine psb_m_dgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_dgelp use psb_const_mod diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index 988bad52..24ce41f1 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_e_exscanv(n,x,info,shift,ibase) + use psi_e_serial_mod, psb_protect_name => psi_e_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_epk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_epk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + integer(psb_epk_) :: shift_, tp, ts + integer(psb_epk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = ezero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_e_exscanv + subroutine psb_m_egelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_egelp use psb_const_mod @@ -441,9 +532,9 @@ subroutine psi_eaxpby(m,n,alpha, x, beta, y, info) integer(psb_epk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +593,8 @@ subroutine psi_eaxpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +624,106 @@ subroutine psi_eaxpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call eaxpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call eaxpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.ezero) then + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = ezero + enddo + else if (beta.eq.eone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +746,7 @@ subroutine psi_eaxpbyv2(m,alpha, x, beta, y, z, info) integer(psb_epk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +785,105 @@ subroutine psi_eaxpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call eaxpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.ezero) then + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = ezero + enddo + else if (beta.eq.eone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1231,7 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.ezero) then if (beta.eq.ezero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = ezero enddo @@ -953,12 +1243,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1261,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1276,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1294,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1309,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1327,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1342,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1435,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1453,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1468,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1486,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1501,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1519,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1534,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index 83b078f0..c5889013 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_i2_exscanv(n,x,info,shift,ibase) + use psi_i2_serial_mod, psb_protect_name => psi_i2_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_i2pk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_i2pk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + integer(psb_i2pk_) :: shift_, tp, ts + integer(psb_i2pk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = i2zero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_i2_exscanv + subroutine psb_m_i2gelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_i2gelp use psb_const_mod @@ -441,9 +532,9 @@ subroutine psi_i2axpby(m,n,alpha, x, beta, y, info) integer(psb_i2pk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +593,8 @@ subroutine psi_i2axpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +624,106 @@ subroutine psi_i2axpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call i2axpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call i2axpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.i2zero) then + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = i2zero + enddo + else if (beta.eq.i2one) then + ! + ! Do nothing! + ! + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +746,7 @@ subroutine psi_i2axpbyv2(m,alpha, x, beta, y, z, info) integer(psb_i2pk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +785,105 @@ subroutine psi_i2axpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call i2axpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.i2zero) then + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = i2zero + enddo + else if (beta.eq.i2one) then + ! + ! Do nothing! + ! + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1231,7 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.i2zero) then if (beta.eq.i2zero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = i2zero enddo @@ -953,12 +1243,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1261,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1276,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1294,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1309,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1327,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1342,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1435,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1453,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1468,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1486,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1501,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1519,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1534,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index 950e2358..d7849cd7 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -29,6 +29,99 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_m_exscanv(n,x,info,shift,ibase) + use psi_m_serial_mod, psb_protect_name => psi_m_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_mpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + integer(psb_mpk_) :: shift_, tp, ts + integer(psb_mpk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + logical is_nested + + if (present(shift)) then + shift_ = shift + else + shift_ = mzero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + is_nested = omp_get_nested() + call omp_set_nested(.true.) + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + !$OMP END PARALLEL + call omp_set_nested(is_nested) +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_m_exscanv + subroutine psb_m_mgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_mgelp use psb_const_mod @@ -441,9 +534,9 @@ subroutine psi_maxpby(m,n,alpha, x, beta, y, info) integer(psb_mpk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +595,8 @@ subroutine psi_maxpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +626,106 @@ subroutine psi_maxpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call maxpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call maxpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.mzero) then + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = mzero + enddo + else if (beta.eq.mone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +748,7 @@ subroutine psi_maxpbyv2(m,alpha, x, beta, y, z, info) integer(psb_mpk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +787,105 @@ subroutine psi_maxpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call maxpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.mzero) then + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = mzero + enddo + else if (beta.eq.mone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1233,7 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.mzero) then if (beta.eq.mzero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = mzero enddo @@ -953,12 +1245,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1263,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1278,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1296,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1311,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1329,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1344,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1437,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1455,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1470,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1488,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1503,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1521,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1536,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index 6c8e21e2..ec22ade7 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_s_exscanv(n,x,info,shift,ibase) + use psi_s_serial_mod, psb_protect_name => psi_s_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + real(psb_spk_) :: shift_, tp, ts + real(psb_spk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = szero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_s_exscanv + subroutine psb_m_sgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_sgelp use psb_const_mod diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index f3087992..9bcdfd7e 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -29,6 +29,97 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_z_exscanv(n,x,info,shift,ibase) + use psi_z_serial_mod, psb_protect_name => psi_z_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), intent(in), optional :: shift + integer(psb_ipk_), intent(in), optional :: ibase + + complex(psb_dpk_) :: shift_, tp, ts + complex(psb_dpk_), allocatable :: suma(:) + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + + if (present(shift)) then + shift_ = shift + else + shift_ = zzero + end if + if (present(ibase)) then + ibase_ = ibase + else + ibase_ = ione + end if + +#if defined(OPENMP) + + !$OMP PARALLEL default(none) & + !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & + !$OMP private(ithread,wrk,i,first_idx,last_idx) + + !$OMP SINGLE + nthreads = omp_get_num_threads() + allocate(suma(nthreads+1)) + suma(:) = 0 + !suma(1) = 1 + !$OMP END SINGLE + ithread = omp_get_thread_num() + + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) + if (first_idx<=last_idx) then + suma(ithread+2) = suma(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + suma(ithread+2) = suma(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + !$OMP SINGLE + do i=2,nthreads+1 + suma(i) = suma(i) + suma(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = suma(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + !$OMP END SINGLE + + !$OMP END PARALLEL +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif + +end subroutine psi_z_exscanv + subroutine psb_m_zgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_zgelp use psb_const_mod From 9c248a31e2a74248cbab7028f00b16ffcdf34b75 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 19 Apr 2023 12:12:58 +0200 Subject: [PATCH 17/38] Refactored EXSCAN and its OpenMP usage. --- base/serial/impl/psb_c_csr_impl.F90 | 20 ++--- base/serial/impl/psb_d_csr_impl.F90 | 20 ++--- base/serial/impl/psb_s_csr_impl.F90 | 20 ++--- base/serial/impl/psb_z_csr_impl.F90 | 20 ++--- base/serial/psi_c_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_d_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_e_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_i2_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_m_serial_impl.F90 | 115 +++++++++++++++------------- base/serial/psi_s_serial_impl.F90 | 113 +++++++++++++++------------ base/serial/psi_z_serial_impl.F90 | 113 +++++++++++++++------------ 11 files changed, 473 insertions(+), 400 deletions(-) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 388e9e2b..b742204d 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=cone,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_c_mv_csr_from_coo diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 1762604d..86287e32 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=done,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_d_mv_csr_from_coo diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 71da21a4..46ead8fc 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=sone,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_s_mv_csr_from_coo diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 34332f46..b6ec8fe7 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2920,16 +2920,14 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2937,6 +2935,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2944,8 +2943,8 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do + call psi_exscan(nr+1,a%irp,info,shift=zone,ibase=ione) #endif - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) call a%set_host() @@ -3089,16 +3088,14 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) reduction(max:info) !$OMP WORKSHARE a%irp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -3106,6 +3103,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3113,11 +3111,9 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do -#endif call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) +#endif - !write(0,*) name,' Check:',a%irp(nr+1),all(a%irp(1:nr) < a%irp(nr+1)) - !write(0,*) name,a%irp(:) call a%set_host() end subroutine psb_z_mv_csr_from_coo diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index 91f463f4..2391becb 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase complex(psb_spk_) :: shift_, tp, ts - complex(psb_spk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_c_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_c_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_c_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + complex(psb_spk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_c_exscan +#endif end subroutine psi_c_exscanv subroutine psb_m_cgelp(trans,iperm,x,info) diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index d8fb92d7..099dd1d4 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase real(psb_dpk_) :: shift_, tp, ts - real(psb_dpk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_d_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_d_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_d_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + real(psb_dpk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_d_exscan +#endif end subroutine psi_d_exscanv subroutine psb_m_dgelp(trans,iperm,x,info) diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index 24ce41f1..10ca93b4 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase integer(psb_epk_) :: shift_, tp, ts - integer(psb_epk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_e_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_e_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_e_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + integer(psb_epk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_e_exscan +#endif end subroutine psi_e_exscanv subroutine psb_m_egelp(trans,iperm,x,info) diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index c5889013..3ccd35bc 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase integer(psb_i2pk_) :: shift_, tp, ts - integer(psb_i2pk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_i2_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_i2_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_i2_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + integer(psb_i2pk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_i2_exscan +#endif end subroutine psi_i2_exscanv subroutine psb_m_i2gelp(trans,iperm,x,info) diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index d7849cd7..c3d73e5a 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -44,9 +44,8 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase integer(psb_mpk_) :: shift_, tp, ts - integer(psb_mpk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ - logical is_nested + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -60,56 +59,14 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - is_nested = omp_get_nested() - call omp_set_nested(.true.) - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_m_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + !$OMP PARALLEL default(shared) + call inner_m_exscan() + !$OMP END PARALLEL end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do - end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - !$OMP END PARALLEL - call omp_set_nested(is_nested) #else tp = shift_ do i=1,n @@ -119,7 +76,61 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_m_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + integer(psb_mpk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_m_exscan +#endif end subroutine psi_m_exscanv subroutine psb_m_mgelp(trans,iperm,x,info) diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index ec22ade7..fd1c4cd4 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase real(psb_spk_) :: shift_, tp, ts - real(psb_spk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_s_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_s_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_s_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + real(psb_spk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_s_exscan +#endif end subroutine psi_s_exscanv subroutine psb_m_sgelp(trans,iperm,x,info) diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index 9bcdfd7e..bf055476 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -44,8 +44,8 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) integer(psb_ipk_), intent(in), optional :: ibase complex(psb_dpk_) :: shift_, tp, ts - complex(psb_dpk_), allocatable :: suma(:) - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk, ibase_ + integer(psb_ipk_) :: ibase_ + logical is_nested, is_parallel if (present(shift)) then shift_ = shift @@ -59,55 +59,14 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) end if #if defined(OPENMP) - - !$OMP PARALLEL default(none) & - !$OMP shared(suma,nthreads,n,x,shift_,ibase_) & - !$OMP private(ithread,wrk,i,first_idx,last_idx) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - allocate(suma(nthreads+1)) - suma(:) = 0 - !suma(1) = 1 - !$OMP END SINGLE - ithread = omp_get_thread_num() - - - wrk = (n)/nthreads - if (ithread < MOD((n),nthreads)) then - wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_z_exscan() else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ - end if - - last_idx = min(first_idx + wrk - 1,n - (ibase_-ione)) - if (first_idx<=last_idx) then - suma(ithread+2) = suma(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - suma(ithread+2) = suma(ithread+2) + x(i) - x(i) = x(i)+x(i-1) - end do + !$OMP PARALLEL default(shared) + call inner_z_exscan() + !$OMP END PARALLEL end if - !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - suma(i) = suma(i) + suma(i-1) - end do - !$OMP END SINGLE - - !$OMP BARRIER - - !$OMP DO SCHEDULE(STATIC) - do i=1,n - x(i) = suma(ithread+1) + x(i) + shift_ - end do - !$OMP END DO - !$OMP SINGLE - x(1) = shift_ - !$OMP END SINGLE - - !$OMP END PARALLEL #else tp = shift_ do i=1,n @@ -117,7 +76,61 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) end do #endif - +#if defined(OPENMP) +contains + subroutine inner_z_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk + complex(psb_dpk_), pointer :: sumb(:) + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + first_idx = ithread*wrk + ibase_ + else + first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + end if + + last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + if (first_idx<=last_idx) then + sumb(ithread+2) = sumb(ithread+2) + x(first_idx) + do i=first_idx+1,last_idx + sumb(ithread+2) = sumb(ithread+2) + x(i) + x(i) = x(i)+x(i-1) + end do + end if + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = sumb(ithread+1) + x(i) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + x(1) = shift_ + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_z_exscan +#endif end subroutine psi_z_exscanv subroutine psb_m_zgelp(trans,iperm,x,info) From 05b684ddbb947e60cb41a09f87878cb61c16a9a8 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 19 Apr 2023 13:23:48 +0200 Subject: [PATCH 18/38] Updated use of exscan in CSC --- base/serial/impl/psb_c_csc_impl.F90 | 10 ++++------ base/serial/impl/psb_d_csc_impl.F90 | 10 ++++------ base/serial/impl/psb_s_csc_impl.F90 | 10 ++++------ base/serial/impl/psb_z_csc_impl.F90 | 10 ++++------ 4 files changed, 16 insertions(+), 24 deletions(-) diff --git a/base/serial/impl/psb_c_csc_impl.F90 b/base/serial/impl/psb_c_csc_impl.F90 index c573a40d..7532d5a0 100644 --- a/base/serial/impl/psb_c_csc_impl.F90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2228,16 +2228,14 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) !$OMP WORKSHARE a%icp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2245,16 +2243,16 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) !$OMP END PARALLEL - #else a%icp(:) = 0 do k=1,nza i = itemp(k) a%icp(i) = a%icp(i) + 1 end do -#endif call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) +#endif call a%set_host() diff --git a/base/serial/impl/psb_d_csc_impl.F90 b/base/serial/impl/psb_d_csc_impl.F90 index 891df5a3..38f746de 100644 --- a/base/serial/impl/psb_d_csc_impl.F90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2228,16 +2228,14 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) !$OMP WORKSHARE a%icp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2245,16 +2243,16 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) !$OMP END PARALLEL - #else a%icp(:) = 0 do k=1,nza i = itemp(k) a%icp(i) = a%icp(i) + 1 end do -#endif call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) +#endif call a%set_host() diff --git a/base/serial/impl/psb_s_csc_impl.F90 b/base/serial/impl/psb_s_csc_impl.F90 index 2bf77184..d1fb8c4b 100644 --- a/base/serial/impl/psb_s_csc_impl.F90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2228,16 +2228,14 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) !$OMP WORKSHARE a%icp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2245,16 +2243,16 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) !$OMP END PARALLEL - #else a%icp(:) = 0 do k=1,nza i = itemp(k) a%icp(i) = a%icp(i) + 1 end do -#endif call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) +#endif call a%set_host() diff --git a/base/serial/impl/psb_z_csc_impl.F90 b/base/serial/impl/psb_z_csc_impl.F90 index 22ea3677..2735bcbd 100644 --- a/base/serial/impl/psb_z_csc_impl.F90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2228,16 +2228,14 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) #if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,a,itemp,nza) & - !$OMP private(i,info) + !$OMP PARALLEL default(shared) !$OMP WORKSHARE a%icp(:) = 0 !$OMP END WORKSHARE !$OMP DO schedule(STATIC) & - !$OMP private(k) + !$OMP private(k,i) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE @@ -2245,16 +2243,16 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) !$OMP END PARALLEL - #else a%icp(:) = 0 do k=1,nza i = itemp(k) a%icp(i) = a%icp(i) + 1 end do -#endif call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) +#endif call a%set_host() From 08ff37332a53be836255b7ed41b3d08bc405d462 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 19 Apr 2023 14:06:40 +0200 Subject: [PATCH 19/38] Enable OpenMP in test/pargen --- test/omp/psb_tomp.F90 | 14 +- test/pargen/psb_d_pde2d.F90 | 216 ++++++++++++++++----------- test/pargen/psb_d_pde3d.F90 | 290 ++++++++++++++---------------------- test/pargen/psb_s_pde2d.F90 | 216 ++++++++++++++++----------- test/pargen/psb_s_pde3d.F90 | 290 ++++++++++++++---------------------- 5 files changed, 467 insertions(+), 559 deletions(-) diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 index f3d19a25..fda08f4e 100644 --- a/test/omp/psb_tomp.F90 +++ b/test/omp/psb_tomp.F90 @@ -452,9 +452,6 @@ contains call psb_barrier(ctxt) t1 = psb_wtime() !$omp parallel shared(deltah,myidx,a,desc_a) - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. ! block integer(psb_ipk_) :: i,j,ii,ib,icoeff, ix,iy,iz, ith,nth @@ -639,16 +636,7 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if -!!$ !$omp parallel -!!$ !$omp master -!!$ block -!!$ character(len=1024) :: fname -!!$ write(fname,'(a,i4.4,a,i4.4,a)') 'a-',iam,'-',np,'.mtx' -!!$ write(0,*) iam,' Size of A ',a%get_nrows(),a%get_ncols(),a%get_nzeros() -!!$ call a%print(fname,head='Test') -!!$ end block -!!$ !$omp end master -!!$ !$omp end parallel + call psb_erractionrestore(err_act) return diff --git a/test/pargen/psb_d_pde2d.F90 b/test/pargen/psb_d_pde2d.F90 index 6da97828..11777b19 100644 --- a/test/pargen/psb_d_pde2d.F90 +++ b/test/pargen/psb_d_pde2d.F90 @@ -156,6 +156,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -192,7 +195,7 @@ contains type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr real(psb_dpk_) :: zt(nb),x,y,z - integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_, mysz integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner ! For 2D partition @@ -204,8 +207,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -391,7 +393,6 @@ contains end if end block - case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -418,94 +419,114 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(done,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif - + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(done,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel + + tgen = psb_wtime()-t1 if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -514,8 +535,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info,mold=imold) @@ -579,6 +598,9 @@ program psb_d_pde2d use psb_krylov_mod use psb_util_mod use psb_d_pde2d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -600,7 +622,7 @@ program psb_d_pde2d type(psb_d_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -625,7 +647,16 @@ program psb_d_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -750,6 +781,8 @@ program psb_d_pde2d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter @@ -790,7 +823,8 @@ contains ! ! get iteration parameters from standard input ! - subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ipart,parms) + subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,& + & itmax,itrace,irst,ipart,parms) type(psb_ctxt_type) :: ctxt character(len=*) :: kmethd, ptype, afmt integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst,ipart diff --git a/test/pargen/psb_d_pde3d.F90 b/test/pargen/psb_d_pde3d.F90 index d4eeccf2..6e895c00 100644 --- a/test/pargen/psb_d_pde3d.F90 +++ b/test/pargen/psb_d_pde3d.F90 @@ -172,6 +172,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -220,8 +223,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -377,7 +379,6 @@ contains ! call psb_cdall(ctxt,desc_a,info,vl=myidx) - ! ! Specify process topology ! @@ -447,69 +448,79 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr, nb - ib = min(nb,nlr-ii+1) - !ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + !ib = min(nb,mysz-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif ! term depending on (x,y,z) val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & @@ -546,14 +557,22 @@ contains endif end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle + end do + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -569,113 +588,6 @@ contains call psb_cdasb(desc_a,info,mold=imold) tcdasb = psb_wtime()-t1 - if (.false.) then - ! - ! Add extra rows to test remote build. - ! - block - integer(psb_ipk_) :: ks, i - ks = desc_a%get_local_cols()-desc_a%get_local_rows() - if (ks > 0) ks = max(1,ks / 10) - mysz = nlr+ks - call psb_realloc(mysz,myidx,info) - do i=nlr+1, mysz - myidx(i) = i - end do - call desc_a%l2gv1(myidx(nlr+1:mysz),info) - !write(0,*) iam,' Check on extra nodes ',nlr,mysz,':',myidx(nlr+1:mysz) - do ii= nlr+1, mysz, nb - ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - - end block - end if - call psb_barrier(ctxt) t1 = psb_wtime() if (info == psb_success_) then @@ -719,7 +631,7 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if - deallocate(val,irow,icol) + call psb_erractionrestore(err_act) return @@ -744,6 +656,9 @@ program psb_d_pde3d use psb_krylov_mod use psb_util_mod use psb_d_pde3d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -765,7 +680,7 @@ program psb_d_pde3d type(psb_d_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -790,7 +705,16 @@ program psb_d_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -914,6 +838,8 @@ program psb_d_pde3d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter diff --git a/test/pargen/psb_s_pde2d.F90 b/test/pargen/psb_s_pde2d.F90 index 664d5d08..f14d2cb4 100644 --- a/test/pargen/psb_s_pde2d.F90 +++ b/test/pargen/psb_s_pde2d.F90 @@ -156,6 +156,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -192,7 +195,7 @@ contains type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr real(psb_spk_) :: zt(nb),x,y,z - integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_, mysz integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner ! For 2D partition @@ -204,8 +207,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -391,7 +393,6 @@ contains end if end block - case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -418,94 +419,114 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif - + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel + + tgen = psb_wtime()-t1 if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -514,8 +535,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info,mold=imold) @@ -579,6 +598,9 @@ program psb_s_pde2d use psb_krylov_mod use psb_util_mod use psb_s_pde2d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -600,7 +622,7 @@ program psb_s_pde2d type(psb_s_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -625,7 +647,16 @@ program psb_s_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -750,6 +781,8 @@ program psb_s_pde2d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter @@ -790,7 +823,8 @@ contains ! ! get iteration parameters from standard input ! - subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ipart,parms) + subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,& + & itmax,itrace,irst,ipart,parms) type(psb_ctxt_type) :: ctxt character(len=*) :: kmethd, ptype, afmt integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst,ipart diff --git a/test/pargen/psb_s_pde3d.F90 b/test/pargen/psb_s_pde3d.F90 index 280a7c34..2938a4ff 100644 --- a/test/pargen/psb_s_pde3d.F90 +++ b/test/pargen/psb_s_pde3d.F90 @@ -172,6 +172,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -220,8 +223,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -377,7 +379,6 @@ contains ! call psb_cdall(ctxt,desc_a,info,vl=myidx) - ! ! Specify process topology ! @@ -447,69 +448,79 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr, nb - ib = min(nb,nlr-ii+1) - !ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + !ib = min(nb,mysz-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif ! term depending on (x,y,z) val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & @@ -546,14 +557,22 @@ contains endif end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle + end do + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -569,113 +588,6 @@ contains call psb_cdasb(desc_a,info,mold=imold) tcdasb = psb_wtime()-t1 - if (.false.) then - ! - ! Add extra rows to test remote build. - ! - block - integer(psb_ipk_) :: ks, i - ks = desc_a%get_local_cols()-desc_a%get_local_rows() - if (ks > 0) ks = max(1,ks / 10) - mysz = nlr+ks - call psb_realloc(mysz,myidx,info) - do i=nlr+1, mysz - myidx(i) = i - end do - call desc_a%l2gv1(myidx(nlr+1:mysz),info) - !write(0,*) iam,' Check on extra nodes ',nlr,mysz,':',myidx(nlr+1:mysz) - do ii= nlr+1, mysz, nb - ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - - end block - end if - call psb_barrier(ctxt) t1 = psb_wtime() if (info == psb_success_) then @@ -719,7 +631,7 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if - deallocate(val,irow,icol) + call psb_erractionrestore(err_act) return @@ -744,6 +656,9 @@ program psb_s_pde3d use psb_krylov_mod use psb_util_mod use psb_s_pde3d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -765,7 +680,7 @@ program psb_s_pde3d type(psb_s_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -790,7 +705,16 @@ program psb_s_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -914,6 +838,8 @@ program psb_s_pde3d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter From f3efea0a893cb7b2246e68a768989d7fec566a5b Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 20 Apr 2023 12:31:32 +0200 Subject: [PATCH 20/38] Take out IBASE from exscan, makes no sense. --- base/modules/auxil/psi_c_serial_mod.f90 | 3 +-- base/modules/auxil/psi_d_serial_mod.f90 | 3 +-- base/modules/auxil/psi_e_serial_mod.f90 | 3 +-- base/modules/auxil/psi_i2_serial_mod.f90 | 3 +-- base/modules/auxil/psi_m_serial_mod.f90 | 3 +-- base/modules/auxil/psi_s_serial_mod.f90 | 3 +-- base/modules/auxil/psi_z_serial_mod.f90 | 3 +-- base/serial/impl/psb_c_csc_impl.F90 | 4 ++-- base/serial/impl/psb_c_csr_impl.F90 | 8 ++++---- base/serial/impl/psb_d_csc_impl.F90 | 4 ++-- base/serial/impl/psb_d_csr_impl.F90 | 8 ++++---- base/serial/impl/psb_s_csc_impl.F90 | 4 ++-- base/serial/impl/psb_s_csr_impl.F90 | 8 ++++---- base/serial/impl/psb_z_csc_impl.F90 | 4 ++-- base/serial/impl/psb_z_csr_impl.F90 | 8 ++++---- base/serial/psi_c_serial_impl.F90 | 15 ++++----------- base/serial/psi_d_serial_impl.F90 | 15 ++++----------- base/serial/psi_e_serial_impl.F90 | 15 ++++----------- base/serial/psi_i2_serial_impl.F90 | 15 ++++----------- base/serial/psi_m_serial_impl.F90 | 15 ++++----------- base/serial/psi_s_serial_impl.F90 | 15 ++++----------- base/serial/psi_z_serial_impl.F90 | 15 ++++----------- 22 files changed, 59 insertions(+), 115 deletions(-) diff --git a/base/modules/auxil/psi_c_serial_mod.f90 b/base/modules/auxil/psi_c_serial_mod.f90 index d62ba3bc..0fdff04b 100644 --- a/base/modules/auxil/psi_c_serial_mod.f90 +++ b/base/modules/auxil/psi_c_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_c_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_c_exscanv(n,x,info,shift,ibase) + subroutine psi_c_exscanv(n,x,info,shift) import :: psb_ipk_, psb_spk_ implicit none integer(psb_ipk_), intent(in) :: n complex(psb_spk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info complex(psb_spk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_c_exscanv end interface psi_exscan diff --git a/base/modules/auxil/psi_d_serial_mod.f90 b/base/modules/auxil/psi_d_serial_mod.f90 index ae88be74..0ce14dbb 100644 --- a/base/modules/auxil/psi_d_serial_mod.f90 +++ b/base/modules/auxil/psi_d_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_d_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_d_exscanv(n,x,info,shift,ibase) + subroutine psi_d_exscanv(n,x,info,shift) import :: psb_ipk_, psb_dpk_ implicit none integer(psb_ipk_), intent(in) :: n real(psb_dpk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info real(psb_dpk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_d_exscanv end interface psi_exscan diff --git a/base/modules/auxil/psi_e_serial_mod.f90 b/base/modules/auxil/psi_e_serial_mod.f90 index a5544075..f0372e01 100644 --- a/base/modules/auxil/psi_e_serial_mod.f90 +++ b/base/modules/auxil/psi_e_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_e_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_e_exscanv(n,x,info,shift,ibase) + subroutine psi_e_exscanv(n,x,info,shift) import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ implicit none integer(psb_ipk_), intent(in) :: n integer(psb_epk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info integer(psb_epk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_e_exscanv end interface psi_exscan diff --git a/base/modules/auxil/psi_i2_serial_mod.f90 b/base/modules/auxil/psi_i2_serial_mod.f90 index c0b5a327..70dd95e1 100644 --- a/base/modules/auxil/psi_i2_serial_mod.f90 +++ b/base/modules/auxil/psi_i2_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_i2_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_i2_exscanv(n,x,info,shift,ibase) + subroutine psi_i2_exscanv(n,x,info,shift) import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ implicit none integer(psb_ipk_), intent(in) :: n integer(psb_i2pk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info integer(psb_i2pk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_i2_exscanv end interface psi_exscan diff --git a/base/modules/auxil/psi_m_serial_mod.f90 b/base/modules/auxil/psi_m_serial_mod.f90 index ab875f7b..cfd1348e 100644 --- a/base/modules/auxil/psi_m_serial_mod.f90 +++ b/base/modules/auxil/psi_m_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_m_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_m_exscanv(n,x,info,shift,ibase) + subroutine psi_m_exscanv(n,x,info,shift) import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ implicit none integer(psb_ipk_), intent(in) :: n integer(psb_mpk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info integer(psb_mpk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_m_exscanv end interface psi_exscan diff --git a/base/modules/auxil/psi_s_serial_mod.f90 b/base/modules/auxil/psi_s_serial_mod.f90 index fee1afc6..25c4a7ef 100644 --- a/base/modules/auxil/psi_s_serial_mod.f90 +++ b/base/modules/auxil/psi_s_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_s_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_s_exscanv(n,x,info,shift,ibase) + subroutine psi_s_exscanv(n,x,info,shift) import :: psb_ipk_, psb_spk_ implicit none integer(psb_ipk_), intent(in) :: n real(psb_spk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info real(psb_spk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_s_exscanv end interface psi_exscan diff --git a/base/modules/auxil/psi_z_serial_mod.f90 b/base/modules/auxil/psi_z_serial_mod.f90 index 3ab430cc..b40cf05a 100644 --- a/base/modules/auxil/psi_z_serial_mod.f90 +++ b/base/modules/auxil/psi_z_serial_mod.f90 @@ -157,14 +157,13 @@ module psi_z_serial_mod end interface psi_sct interface psi_exscan - subroutine psi_z_exscanv(n,x,info,shift,ibase) + subroutine psi_z_exscanv(n,x,info,shift) import :: psb_ipk_, psb_dpk_ implicit none integer(psb_ipk_), intent(in) :: n complex(psb_dpk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info complex(psb_dpk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase end subroutine psi_z_exscanv end interface psi_exscan diff --git a/base/serial/impl/psb_c_csc_impl.F90 b/base/serial/impl/psb_c_csc_impl.F90 index 7532d5a0..bb500628 100644 --- a/base/serial/impl/psb_c_csc_impl.F90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2243,7 +2243,7 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) !$OMP END PARALLEL #else a%icp(:) = 0 @@ -2251,7 +2251,7 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) i = itemp(k) a%icp(i) = a%icp(i) + 1 end do - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index b742204d..06dec27d 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2935,7 +2935,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2943,7 +2943,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=cone,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() @@ -3103,7 +3103,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3111,7 +3111,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_d_csc_impl.F90 b/base/serial/impl/psb_d_csc_impl.F90 index 38f746de..ec80875a 100644 --- a/base/serial/impl/psb_d_csc_impl.F90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2243,7 +2243,7 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) !$OMP END PARALLEL #else a%icp(:) = 0 @@ -2251,7 +2251,7 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) i = itemp(k) a%icp(i) = a%icp(i) + 1 end do - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 86287e32..f071a5b8 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2935,7 +2935,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2943,7 +2943,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=done,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() @@ -3103,7 +3103,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3111,7 +3111,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_s_csc_impl.F90 b/base/serial/impl/psb_s_csc_impl.F90 index d1fb8c4b..fa3fe880 100644 --- a/base/serial/impl/psb_s_csc_impl.F90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2243,7 +2243,7 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) !$OMP END PARALLEL #else a%icp(:) = 0 @@ -2251,7 +2251,7 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) i = itemp(k) a%icp(i) = a%icp(i) + 1 end do - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 46ead8fc..cd2933f7 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2935,7 +2935,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2943,7 +2943,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=sone,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() @@ -3103,7 +3103,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3111,7 +3111,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_z_csc_impl.F90 b/base/serial/impl/psb_z_csc_impl.F90 index 2735bcbd..28285e4d 100644 --- a/base/serial/impl/psb_z_csc_impl.F90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2243,7 +2243,7 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) !$OMP END PARALLEL #else a%icp(:) = 0 @@ -2251,7 +2251,7 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) i = itemp(k) a%icp(i) = a%icp(i) + 1 end do - call psi_exscan(nc+1,a%icp,info,shift=ione,ibase=ione) + call psi_exscan(nc+1,a%icp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index b6ec8fe7..88f49335 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2935,7 +2935,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -2943,7 +2943,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=zone,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() @@ -3103,7 +3103,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) !$OMP END ATOMIC end do !$OMP END DO - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) !$OMP END PARALLEL #else a%irp(:) = 0 @@ -3111,7 +3111,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) i = itemp(k) a%irp(i) = a%irp(i) + 1 end do - call psi_exscan(nr+1,a%irp,info,shift=ione,ibase=ione) + call psi_exscan(nr+1,a%irp,info,shift=ione) #endif call a%set_host() diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index 2391becb..d7c92d3a 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_c_exscanv(n,x,info,shift,ibase) +subroutine psi_c_exscanv(n,x,info,shift) use psi_c_serial_mod, psb_protect_name => psi_c_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) complex(psb_spk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info complex(psb_spk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase complex(psb_spk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_c_exscanv(n,x,info,shift,ibase) else shift_ = czero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index 099dd1d4..56bedc84 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_d_exscanv(n,x,info,shift,ibase) +subroutine psi_d_exscanv(n,x,info,shift) use psi_d_serial_mod, psb_protect_name => psi_d_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) real(psb_dpk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info real(psb_dpk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase real(psb_dpk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_d_exscanv(n,x,info,shift,ibase) else shift_ = dzero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index 10ca93b4..ba9a0b2a 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_e_exscanv(n,x,info,shift,ibase) +subroutine psi_e_exscanv(n,x,info,shift) use psi_e_serial_mod, psb_protect_name => psi_e_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) integer(psb_epk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info integer(psb_epk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase integer(psb_epk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_e_exscanv(n,x,info,shift,ibase) else shift_ = ezero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index 3ccd35bc..200e378a 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_i2_exscanv(n,x,info,shift,ibase) +subroutine psi_i2_exscanv(n,x,info,shift) use psi_i2_serial_mod, psb_protect_name => psi_i2_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) integer(psb_i2pk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info integer(psb_i2pk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase integer(psb_i2pk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_i2_exscanv(n,x,info,shift,ibase) else shift_ = i2zero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index c3d73e5a..05461e1e 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_m_exscanv(n,x,info,shift,ibase) +subroutine psi_m_exscanv(n,x,info,shift) use psi_m_serial_mod, psb_protect_name => psi_m_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) integer(psb_mpk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info integer(psb_mpk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase integer(psb_mpk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_m_exscanv(n,x,info,shift,ibase) else shift_ = mzero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index fd1c4cd4..6e5eadd7 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_s_exscanv(n,x,info,shift,ibase) +subroutine psi_s_exscanv(n,x,info,shift) use psi_s_serial_mod, psb_protect_name => psi_s_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) real(psb_spk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info real(psb_spk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase real(psb_spk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_s_exscanv(n,x,info,shift,ibase) else shift_ = szero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index bf055476..ef13db0b 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -29,7 +29,7 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! -subroutine psi_z_exscanv(n,x,info,shift,ibase) +subroutine psi_z_exscanv(n,x,info,shift) use psi_z_serial_mod, psb_protect_name => psi_z_exscanv use psb_const_mod use psb_error_mod @@ -41,10 +41,8 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) complex(psb_dpk_), intent (inout) :: x(:) integer(psb_ipk_), intent(out) :: info complex(psb_dpk_), intent(in), optional :: shift - integer(psb_ipk_), intent(in), optional :: ibase complex(psb_dpk_) :: shift_, tp, ts - integer(psb_ipk_) :: ibase_ logical is_nested, is_parallel if (present(shift)) then @@ -52,11 +50,6 @@ subroutine psi_z_exscanv(n,x,info,shift,ibase) else shift_ = zzero end if - if (present(ibase)) then - ibase_ = ibase - else - ibase_ = ione - end if #if defined(OPENMP) is_parallel = omp_in_parallel() @@ -97,12 +90,12 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + ibase_ + first_idx = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + ibase_ + first_idx = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n - (ione-ibase_)) + last_idx = min(first_idx + wrk - 1,n ) if (first_idx<=last_idx) then sumb(ithread+2) = sumb(ithread+2) + x(first_idx) do i=first_idx+1,last_idx From 5bc02fb2e6b47d0e6559f3a6951b266dd25f3dc1 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Fri, 21 Apr 2023 08:55:20 +0200 Subject: [PATCH 21/38] Take out redundant statements in SPINS --- base/tools/psb_cspins.F90 | 10 ++-------- base/tools/psb_dspins.F90 | 10 ++-------- base/tools/psb_sspins.F90 | 10 ++-------- base/tools/psb_zspins.F90 | 10 ++-------- 4 files changed, 8 insertions(+), 32 deletions(-) diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index 66e03ed9..0f5fc9df 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -73,7 +73,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k, ith, nth + integer(psb_ipk_) :: i,k integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) complex(psb_spk_), allocatable :: lval(:) @@ -85,13 +85,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) -#if defined(OPENMP) - nth = omp_get_num_threads() - ith = omp_get_thread_num() -#else - nth = 1 - ith = 0 -#endif + if (nz < 0) then info = 1111 call psb_errpush(info,name) diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 2e11c511..06f913b5 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -73,7 +73,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k, ith, nth + integer(psb_ipk_) :: i,k integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) real(psb_dpk_), allocatable :: lval(:) @@ -85,13 +85,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) -#if defined(OPENMP) - nth = omp_get_num_threads() - ith = omp_get_thread_num() -#else - nth = 1 - ith = 0 -#endif + if (nz < 0) then info = 1111 call psb_errpush(info,name) diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index 90fa9e2d..56ef9c97 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -73,7 +73,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k, ith, nth + integer(psb_ipk_) :: i,k integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) real(psb_spk_), allocatable :: lval(:) @@ -85,13 +85,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) -#if defined(OPENMP) - nth = omp_get_num_threads() - ith = omp_get_thread_num() -#else - nth = 1 - ith = 0 -#endif + if (nz < 0) then info = 1111 call psb_errpush(info,name) diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index 74e2d5dc..d483f198 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -73,7 +73,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) integer(psb_ipk_), parameter :: relocsz=200 logical :: rebuild_, local_ integer(psb_ipk_), allocatable :: ila(:),jla(:) - integer(psb_ipk_) :: i,k, ith, nth + integer(psb_ipk_) :: i,k integer(psb_lpk_) :: nnl integer(psb_lpk_), allocatable :: lila(:),ljla(:) complex(psb_dpk_), allocatable :: lval(:) @@ -85,13 +85,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) ctxt = desc_a%get_context() call psb_info(ctxt, me, np) -#if defined(OPENMP) - nth = omp_get_num_threads() - ith = omp_get_thread_num() -#else - nth = 1 - ith = 0 -#endif + if (nz < 0) then info = 1111 call psb_errpush(info,name) From 74a821752031efcc74c94dcd22a5fd6d196039c7 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 21 Apr 2023 12:02:18 +0200 Subject: [PATCH 22/38] Fixed silly bug in EXSCAN and usage in CSR_IMPL --- base/serial/impl/psb_c_csr_impl.F90 | 4 ++-- base/serial/impl/psb_d_csr_impl.F90 | 4 ++-- base/serial/impl/psb_s_csr_impl.F90 | 4 ++-- base/serial/impl/psb_z_csr_impl.F90 | 4 ++-- base/serial/psi_c_serial_impl.F90 | 30 ++++++++++++++++------------- base/serial/psi_d_serial_impl.F90 | 30 ++++++++++++++++------------- base/serial/psi_e_serial_impl.F90 | 30 ++++++++++++++++------------- base/serial/psi_i2_serial_impl.F90 | 30 ++++++++++++++++------------- base/serial/psi_m_serial_impl.F90 | 30 ++++++++++++++++------------- base/serial/psi_s_serial_impl.F90 | 30 ++++++++++++++++------------- base/serial/psi_z_serial_impl.F90 | 30 ++++++++++++++++------------- 11 files changed, 127 insertions(+), 99 deletions(-) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 06dec27d..7c7452a0 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index f071a5b8..56273b49 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index cd2933f7..17e73934 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 88f49335..f88008d8 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index d7c92d3a..a3898349 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_c_exscanv(n,x,info,shift) complex(psb_spk_), intent(in), optional :: shift complex(psb_spk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - complex(psb_spk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + complex(psb_spk_), pointer :: sumb(:) + complex(psb_spk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = czero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_c_exscan diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index 56bedc84..1b5b1442 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_d_exscanv(n,x,info,shift) real(psb_dpk_), intent(in), optional :: shift real(psb_dpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - real(psb_dpk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + real(psb_dpk_), pointer :: sumb(:) + real(psb_dpk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = dzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_d_exscan diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index ba9a0b2a..9cdcdf0e 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_e_exscanv(n,x,info,shift) integer(psb_epk_), intent(in), optional :: shift integer(psb_epk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - integer(psb_epk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_epk_), pointer :: sumb(:) + integer(psb_epk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = ezero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_e_exscan diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index 200e378a..d25617a9 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_i2_exscanv(n,x,info,shift) integer(psb_i2pk_), intent(in), optional :: shift integer(psb_i2pk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - integer(psb_i2pk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_i2pk_), pointer :: sumb(:) + integer(psb_i2pk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = i2zero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_i2_exscan diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index 05461e1e..05c8e60f 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_m_exscanv(n,x,info,shift) integer(psb_mpk_), intent(in), optional :: shift integer(psb_mpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - integer(psb_mpk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_mpk_), pointer :: sumb(:) + integer(psb_mpk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = mzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_m_exscan diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index 6e5eadd7..26a57e68 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_s_exscanv(n,x,info,shift) real(psb_spk_), intent(in), optional :: shift real(psb_spk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - real(psb_spk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + real(psb_spk_), pointer :: sumb(:) + real(psb_spk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = szero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_s_exscan diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index ef13db0b..0b15b2d6 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_z_exscanv(n,x,info,shift) complex(psb_dpk_), intent(in), optional :: shift complex(psb_dpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - complex(psb_dpk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + complex(psb_dpk_), pointer :: sumb(:) + complex(psb_dpk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = zzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_z_exscan From 40cc78854a6dfd1386a769828e21dd611792fce1 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 21 Apr 2023 13:41:16 +0200 Subject: [PATCH 23/38] Improve implementation of fix_coo using exscan --- base/modules/serial/psb_c_base_mat_mod.F90 | 12 - base/modules/serial/psb_d_base_mat_mod.F90 | 12 - base/modules/serial/psb_s_base_mat_mod.F90 | 12 - base/modules/serial/psb_z_base_mat_mod.F90 | 12 - base/serial/impl/psb_c_coo_impl.F90 | 795 +-------------------- base/serial/impl/psb_d_coo_impl.F90 | 795 +-------------------- base/serial/impl/psb_s_coo_impl.F90 | 795 +-------------------- base/serial/impl/psb_z_coo_impl.F90 | 795 +-------------------- 8 files changed, 92 insertions(+), 3136 deletions(-) diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90 index 7a7bbb1f..a5dd0fd0 100644 --- a/base/modules/serial/psb_c_base_mat_mod.F90 +++ b/base/modules/serial/psb_c_base_mat_mod.F90 @@ -1866,18 +1866,6 @@ module psb_c_base_mat_mod end subroutine psb_c_fix_coo_inner end interface - interface - subroutine psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_c_fix_coo_inner_colmajor - end interface - interface subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) diff --git a/base/modules/serial/psb_d_base_mat_mod.F90 b/base/modules/serial/psb_d_base_mat_mod.F90 index eb49905d..fac0af20 100644 --- a/base/modules/serial/psb_d_base_mat_mod.F90 +++ b/base/modules/serial/psb_d_base_mat_mod.F90 @@ -1866,18 +1866,6 @@ module psb_d_base_mat_mod end subroutine psb_d_fix_coo_inner end interface - interface - subroutine psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_d_fix_coo_inner_colmajor - end interface - interface subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90 index 79c8222b..186ae577 100644 --- a/base/modules/serial/psb_s_base_mat_mod.F90 +++ b/base/modules/serial/psb_s_base_mat_mod.F90 @@ -1866,18 +1866,6 @@ module psb_s_base_mat_mod end subroutine psb_s_fix_coo_inner end interface - interface - subroutine psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_s_fix_coo_inner_colmajor - end interface - interface subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90 index 5b6ca07b..3ce9074f 100644 --- a/base/modules/serial/psb_z_base_mat_mod.F90 +++ b/base/modules/serial/psb_z_base_mat_mod.F90 @@ -1866,18 +1866,6 @@ module psb_z_base_mat_mod end subroutine psb_z_fix_coo_inner end interface - interface - subroutine psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_z_fix_coo_inner_colmajor - end interface - interface subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index c9be113e..da6e97ca 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -3573,7 +3573,7 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3632,8 +3632,6 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_c_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3677,7 +3675,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3760,8 +3758,8 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3774,72 +3772,32 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do @@ -3848,7 +3806,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3877,45 +3835,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3923,7 +3843,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3952,7 +3872,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3981,7 +3901,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4186,7 +4106,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4200,22 +4120,22 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4345,676 +4265,6 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_c_fix_coo_inner_rowmajor -subroutine psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod - use psb_error_mod - use psb_c_base_mat_mod, psb_protect_name => psb_c_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif - implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - complex(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_c_fix_coo_inner_colmajor - subroutine psb_c_cp_coo_to_lcoo(a,b,info) use psb_error_mod @@ -8347,3 +7597,4 @@ subroutine psb_lc_cp_coo_from_icoo(a,b,info) return end subroutine psb_lc_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index bb845f4b..08276552 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -3573,7 +3573,7 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3632,8 +3632,6 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_d_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3677,7 +3675,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3760,8 +3758,8 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3774,72 +3772,32 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do @@ -3848,7 +3806,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3877,45 +3835,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3923,7 +3843,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3952,7 +3872,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3981,7 +3901,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4186,7 +4106,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4200,22 +4120,22 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4345,676 +4265,6 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_d_fix_coo_inner_rowmajor -subroutine psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod - use psb_error_mod - use psb_d_base_mat_mod, psb_protect_name => psb_d_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif - implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - real(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_d_fix_coo_inner_colmajor - subroutine psb_d_cp_coo_to_lcoo(a,b,info) use psb_error_mod @@ -8347,3 +7597,4 @@ subroutine psb_ld_cp_coo_from_icoo(a,b,info) return end subroutine psb_ld_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 0b201684..65bc5e10 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -3573,7 +3573,7 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3632,8 +3632,6 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_s_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3677,7 +3675,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3760,8 +3758,8 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3774,72 +3772,32 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do @@ -3848,7 +3806,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3877,45 +3835,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3923,7 +3843,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3952,7 +3872,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3981,7 +3901,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4186,7 +4106,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4200,22 +4120,22 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4345,676 +4265,6 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_s_fix_coo_inner_rowmajor -subroutine psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod - use psb_error_mod - use psb_s_base_mat_mod, psb_protect_name => psb_s_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif - implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - real(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_s_fix_coo_inner_colmajor - subroutine psb_s_cp_coo_to_lcoo(a,b,info) use psb_error_mod @@ -8347,3 +7597,4 @@ subroutine psb_ls_cp_coo_from_icoo(a,b,info) return end subroutine psb_ls_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 14410f23..721d2eda 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -3573,7 +3573,7 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3632,8 +3632,6 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_z_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3677,7 +3675,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3760,8 +3758,8 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3774,72 +3772,32 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do @@ -3848,7 +3806,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3877,45 +3835,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3923,7 +3843,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3952,7 +3872,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3981,7 +3901,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4186,7 +4106,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4200,22 +4120,22 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4345,676 +4265,6 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_z_fix_coo_inner_rowmajor -subroutine psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod - use psb_error_mod - use psb_z_base_mat_mod, psb_protect_name => psb_z_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif - implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - complex(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_z_fix_coo_inner_colmajor - subroutine psb_z_cp_coo_to_lcoo(a,b,info) use psb_error_mod @@ -8347,3 +7597,4 @@ subroutine psb_lz_cp_coo_from_icoo(a,b,info) return end subroutine psb_lz_cp_coo_from_icoo + From 7e5dc20e0310e3524f57c9b209d89f0a2b12ce93 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 27 Apr 2023 13:05:46 +0200 Subject: [PATCH 24/38] Define new options for BSRCH, clean interface --- base/modules/auxil/psb_c_qsort_mod.f90 | 1 - base/modules/auxil/psb_d_qsort_mod.f90 | 5 +- base/modules/auxil/psb_e_qsort_mod.f90 | 5 +- base/modules/auxil/psb_i2_qsort_mod.f90 | 5 +- base/modules/auxil/psb_m_qsort_mod.f90 | 5 +- base/modules/auxil/psb_s_qsort_mod.f90 | 5 +- base/modules/auxil/psb_z_qsort_mod.f90 | 1 - base/modules/psb_const_mod.F90 | 16 ++-- base/serial/impl/psb_c_coo_impl.F90 | 13 ++- base/serial/impl/psb_d_coo_impl.F90 | 13 ++- base/serial/impl/psb_s_coo_impl.F90 | 13 ++- base/serial/impl/psb_z_coo_impl.F90 | 13 ++- base/serial/sort/psb_c_qsort_impl.f90 | 1 + base/serial/sort/psb_d_msort_impl.f90 | 58 ----------- base/serial/sort/psb_d_qsort_impl.f90 | 122 ++++++++++++++++++++++++ base/serial/sort/psb_e_msort_impl.f90 | 58 ----------- base/serial/sort/psb_e_qsort_impl.f90 | 122 ++++++++++++++++++++++++ base/serial/sort/psb_m_msort_impl.f90 | 58 ----------- base/serial/sort/psb_m_qsort_impl.f90 | 122 ++++++++++++++++++++++++ base/serial/sort/psb_s_msort_impl.f90 | 58 ----------- base/serial/sort/psb_s_qsort_impl.f90 | 122 ++++++++++++++++++++++++ base/serial/sort/psb_z_qsort_impl.f90 | 1 + base/tools/psb_cspins.F90 | 19 +++- base/tools/psb_dspins.F90 | 19 +++- base/tools/psb_sspins.F90 | 19 +++- base/tools/psb_zspins.F90 | 19 +++- test/omp/psb_tomp.F90 | 26 ++++- test/pargen/runs/ppde.inp | 4 +- 28 files changed, 641 insertions(+), 282 deletions(-) diff --git a/base/modules/auxil/psb_c_qsort_mod.f90 b/base/modules/auxil/psb_c_qsort_mod.f90 index 8b365222..6c4ceb3f 100644 --- a/base/modules/auxil/psb_c_qsort_mod.f90 +++ b/base/modules/auxil/psb_c_qsort_mod.f90 @@ -44,7 +44,6 @@ module psb_c_qsort_mod use psb_const_mod - interface psb_qsort subroutine psb_cqsort(x,ix,dir,flag) import diff --git a/base/modules/auxil/psb_d_qsort_mod.f90 b/base/modules/auxil/psb_d_qsort_mod.f90 index 4e1be1d1..4da0b840 100644 --- a/base/modules/auxil/psb_d_qsort_mod.f90 +++ b/base/modules/auxil/psb_d_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_d_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_dbsrch(key,n,v) result(ipos) + function psb_dbsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n real(psb_dpk_) :: key real(psb_dpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_dbsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_e_qsort_mod.f90 b/base/modules/auxil/psb_e_qsort_mod.f90 index 17943bbf..09f45d45 100644 --- a/base/modules/auxil/psb_e_qsort_mod.f90 +++ b/base/modules/auxil/psb_e_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_e_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_ebsrch(key,n,v) result(ipos) + function psb_ebsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n integer(psb_epk_) :: key integer(psb_epk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_ebsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_i2_qsort_mod.f90 b/base/modules/auxil/psb_i2_qsort_mod.f90 index 944a436e..2f192a0a 100644 --- a/base/modules/auxil/psb_i2_qsort_mod.f90 +++ b/base/modules/auxil/psb_i2_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_i2_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_i2bsrch(key,n,v) result(ipos) + function psb_i2bsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n integer(psb_i2pk_) :: key integer(psb_i2pk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_i2bsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_m_qsort_mod.f90 b/base/modules/auxil/psb_m_qsort_mod.f90 index cb4c81c1..bf029065 100644 --- a/base/modules/auxil/psb_m_qsort_mod.f90 +++ b/base/modules/auxil/psb_m_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_m_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_mbsrch(key,n,v) result(ipos) + function psb_mbsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n integer(psb_mpk_) :: key integer(psb_mpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_mbsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_s_qsort_mod.f90 b/base/modules/auxil/psb_s_qsort_mod.f90 index d4851fd1..a5bdb2d9 100644 --- a/base/modules/auxil/psb_s_qsort_mod.f90 +++ b/base/modules/auxil/psb_s_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_s_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_sbsrch(key,n,v) result(ipos) + function psb_sbsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n real(psb_spk_) :: key real(psb_spk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_sbsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_z_qsort_mod.f90 b/base/modules/auxil/psb_z_qsort_mod.f90 index 14ee0c57..2fc6baab 100644 --- a/base/modules/auxil/psb_z_qsort_mod.f90 +++ b/base/modules/auxil/psb_z_qsort_mod.f90 @@ -44,7 +44,6 @@ module psb_z_qsort_mod use psb_const_mod - interface psb_qsort subroutine psb_zqsort(x,ix,dir,flag) import diff --git a/base/modules/psb_const_mod.F90 b/base/modules/psb_const_mod.F90 index cc29f049..56134474 100644 --- a/base/modules/psb_const_mod.F90 +++ b/base/modules/psb_const_mod.F90 @@ -185,13 +185,17 @@ module psb_const_mod ! The up/down constant are defined in pairs having ! opposite values. We make use of this fact in the heapsort routine. ! - integer(psb_ipk_), parameter :: psb_sort_up_ = 1, psb_sort_down_ = -1 - integer(psb_ipk_), parameter :: psb_lsort_up_ = 2, psb_lsort_down_ = -2 - integer(psb_ipk_), parameter :: psb_asort_up_ = 3, psb_asort_down_ = -3 - integer(psb_ipk_), parameter :: psb_alsort_up_ = 4, psb_alsort_down_ = -4 - integer(psb_ipk_), parameter :: psb_sort_ovw_idx_ = 0, psb_sort_keep_idx_ = 1 - integer(psb_ipk_), parameter :: psb_heap_resize = 200 + integer(psb_ipk_), parameter :: psb_sort_up_ = 1, psb_sort_down_ = -1 + integer(psb_ipk_), parameter :: psb_lsort_up_ = 2, psb_lsort_down_ = -2 + integer(psb_ipk_), parameter :: psb_asort_up_ = 3, psb_asort_down_ = -3 + integer(psb_ipk_), parameter :: psb_alsort_up_ = 4, psb_alsort_down_ = -4 + integer(psb_ipk_), parameter :: psb_sort_ovw_idx_ = 0, psb_sort_keep_idx_ = 1 + integer(psb_ipk_), parameter :: psb_heap_resize = 200 + integer(psb_ipk_), parameter :: psb_find_any_ = 0 + integer(psb_ipk_), parameter :: psb_find_first_ge_ = 1 + integer(psb_ipk_), parameter :: psb_find_last_le_ = 2 + ! ! Sparse matrix constants diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index da6e97ca..53bc34f6 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -3674,6 +3674,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads @@ -3757,7 +3758,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) @@ -3782,7 +3783,10 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - + !$OMP SINGLE + t0 = omp_get_wtime() + !$OMP END SINGLE + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving @@ -3803,7 +3807,10 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end do !$OMP BARRIER - + !$OMP SINGLE + t1 = omp_get_wtime() + write(0,*) 'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' do j=idxstart,idxend diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 08276552..030dc867 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -3674,6 +3674,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads @@ -3757,7 +3758,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) @@ -3782,7 +3783,10 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - + !$OMP SINGLE + t0 = omp_get_wtime() + !$OMP END SINGLE + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving @@ -3803,7 +3807,10 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end do !$OMP BARRIER - + !$OMP SINGLE + t1 = omp_get_wtime() + write(0,*) 'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' do j=idxstart,idxend diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 65bc5e10..fb19d523 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -3674,6 +3674,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads @@ -3757,7 +3758,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) @@ -3782,7 +3783,10 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - + !$OMP SINGLE + t0 = omp_get_wtime() + !$OMP END SINGLE + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving @@ -3803,7 +3807,10 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end do !$OMP BARRIER - + !$OMP SINGLE + t1 = omp_get_wtime() + write(0,*) 'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' do j=idxstart,idxend diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 721d2eda..ba14eeba 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -3674,6 +3674,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads @@ -3757,7 +3758,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) @@ -3782,7 +3783,10 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - + !$OMP SINGLE + t0 = omp_get_wtime() + !$OMP END SINGLE + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving @@ -3803,7 +3807,10 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end do !$OMP BARRIER - + !$OMP SINGLE + t1 = omp_get_wtime() + write(0,*) 'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' do j=idxstart,idxend diff --git a/base/serial/sort/psb_c_qsort_impl.f90 b/base/serial/sort/psb_c_qsort_impl.f90 index 712529fc..7f33c099 100644 --- a/base/serial/sort/psb_c_qsort_impl.f90 +++ b/base/serial/sort/psb_c_qsort_impl.f90 @@ -40,6 +40,7 @@ ! Data Structures and Algorithms ! Addison-Wesley ! + subroutine psb_cqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_cqsort use psb_error_mod diff --git a/base/serial/sort/psb_d_msort_impl.f90 b/base/serial/sort/psb_d_msort_impl.f90 index 11029818..66ad7897 100644 --- a/base/serial/sort/psb_d_msort_impl.f90 +++ b/base/serial/sort/psb_d_msort_impl.f90 @@ -76,64 +76,6 @@ subroutine psb_dmsort_u(x,nout,dir) return end subroutine psb_dmsort_u - -function psb_dbsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_dbsrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_dpk_) :: key - real(psb_dpk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_dbsrch - -function psb_dssrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_dssrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_dpk_) :: key - real(psb_dpk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_dssrch - subroutine psb_dmsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_dmsort use psb_error_mod diff --git a/base/serial/sort/psb_d_qsort_impl.f90 b/base/serial/sort/psb_d_qsort_impl.f90 index 13328188..4d6918e7 100644 --- a/base/serial/sort/psb_d_qsort_impl.f90 +++ b/base/serial/sort/psb_d_qsort_impl.f90 @@ -40,6 +40,128 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_dbsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_dbsrch + implicit none + integer(psb_ipk_) :: ipos, n +real(psb_dpk_) :: key +real(psb_dpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) then + ipos = m + exit + end if + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) then + ipos = m + exit + end if + m = m + 1 + end do + else + do while (m>n) + if (v(m)>=key) then + m=m-1 + else + exit + end if + end do + end if + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_dbsrch + +function psb_dssrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_dssrch + implicit none + integer(psb_ipk_) :: ipos, n + real(psb_dpk_) :: key + real(psb_dpk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_dssrch + subroutine psb_dqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_dqsort use psb_error_mod diff --git a/base/serial/sort/psb_e_msort_impl.f90 b/base/serial/sort/psb_e_msort_impl.f90 index d8cd6404..b97d448a 100644 --- a/base/serial/sort/psb_e_msort_impl.f90 +++ b/base/serial/sort/psb_e_msort_impl.f90 @@ -131,64 +131,6 @@ subroutine psb_emsort_u(x,nout,dir) return end subroutine psb_emsort_u - -function psb_ebsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_ebsrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_epk_) :: key - integer(psb_epk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_ebsrch - -function psb_essrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_essrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_epk_) :: key - integer(psb_epk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_essrch - subroutine psb_emsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_emsort use psb_error_mod diff --git a/base/serial/sort/psb_e_qsort_impl.f90 b/base/serial/sort/psb_e_qsort_impl.f90 index 9b95c78e..8be3cd78 100644 --- a/base/serial/sort/psb_e_qsort_impl.f90 +++ b/base/serial/sort/psb_e_qsort_impl.f90 @@ -40,6 +40,128 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_ebsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_ebsrch + implicit none + integer(psb_ipk_) :: ipos, n +integer(psb_epk_) :: key +integer(psb_epk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) then + ipos = m + exit + end if + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) then + ipos = m + exit + end if + m = m + 1 + end do + else + do while (m>n) + if (v(m)>=key) then + m=m-1 + else + exit + end if + end do + end if + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_ebsrch + +function psb_essrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_essrch + implicit none + integer(psb_ipk_) :: ipos, n + integer(psb_epk_) :: key + integer(psb_epk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_essrch + subroutine psb_eqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_eqsort use psb_error_mod diff --git a/base/serial/sort/psb_m_msort_impl.f90 b/base/serial/sort/psb_m_msort_impl.f90 index cd99a3c5..437d1069 100644 --- a/base/serial/sort/psb_m_msort_impl.f90 +++ b/base/serial/sort/psb_m_msort_impl.f90 @@ -131,64 +131,6 @@ subroutine psb_mmsort_u(x,nout,dir) return end subroutine psb_mmsort_u - -function psb_mbsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_mbsrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_mpk_) :: key - integer(psb_mpk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_mbsrch - -function psb_mssrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_mssrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_mpk_) :: key - integer(psb_mpk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_mssrch - subroutine psb_mmsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_mmsort use psb_error_mod diff --git a/base/serial/sort/psb_m_qsort_impl.f90 b/base/serial/sort/psb_m_qsort_impl.f90 index ac8241f5..460bff43 100644 --- a/base/serial/sort/psb_m_qsort_impl.f90 +++ b/base/serial/sort/psb_m_qsort_impl.f90 @@ -40,6 +40,128 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_mbsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_mbsrch + implicit none + integer(psb_ipk_) :: ipos, n +integer(psb_mpk_) :: key +integer(psb_mpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) then + ipos = m + exit + end if + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) then + ipos = m + exit + end if + m = m + 1 + end do + else + do while (m>n) + if (v(m)>=key) then + m=m-1 + else + exit + end if + end do + end if + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_mbsrch + +function psb_mssrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_mssrch + implicit none + integer(psb_ipk_) :: ipos, n + integer(psb_mpk_) :: key + integer(psb_mpk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_mssrch + subroutine psb_mqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_mqsort use psb_error_mod diff --git a/base/serial/sort/psb_s_msort_impl.f90 b/base/serial/sort/psb_s_msort_impl.f90 index dfd7508c..e3382f27 100644 --- a/base/serial/sort/psb_s_msort_impl.f90 +++ b/base/serial/sort/psb_s_msort_impl.f90 @@ -76,64 +76,6 @@ subroutine psb_smsort_u(x,nout,dir) return end subroutine psb_smsort_u - -function psb_sbsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_sbsrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_spk_) :: key - real(psb_spk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_sbsrch - -function psb_sssrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_sssrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_spk_) :: key - real(psb_spk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_sssrch - subroutine psb_smsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_smsort use psb_error_mod diff --git a/base/serial/sort/psb_s_qsort_impl.f90 b/base/serial/sort/psb_s_qsort_impl.f90 index d6e0e66e..44a46e0a 100644 --- a/base/serial/sort/psb_s_qsort_impl.f90 +++ b/base/serial/sort/psb_s_qsort_impl.f90 @@ -40,6 +40,128 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_sbsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_sbsrch + implicit none + integer(psb_ipk_) :: ipos, n +real(psb_spk_) :: key +real(psb_spk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) then + ipos = m + exit + end if + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) then + ipos = m + exit + end if + m = m + 1 + end do + else + do while (m>n) + if (v(m)>=key) then + m=m-1 + else + exit + end if + end do + end if + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_sbsrch + +function psb_sssrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_sssrch + implicit none + integer(psb_ipk_) :: ipos, n + real(psb_spk_) :: key + real(psb_spk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_sssrch + subroutine psb_sqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_sqsort use psb_error_mod diff --git a/base/serial/sort/psb_z_qsort_impl.f90 b/base/serial/sort/psb_z_qsort_impl.f90 index 7b0af1c5..a1cdb193 100644 --- a/base/serial/sort/psb_z_qsort_impl.f90 +++ b/base/serial/sort/psb_z_qsort_impl.f90 @@ -40,6 +40,7 @@ ! Data Structures and Algorithms ! Addison-Wesley ! + subroutine psb_zqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_zqsort use psb_error_mod diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index 0f5fc9df..f523a529 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -135,7 +135,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& @@ -198,9 +198,18 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -210,7 +219,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -229,8 +238,12 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 06f913b5..3b7e0e80 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -135,7 +135,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& @@ -198,9 +198,18 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -210,7 +219,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -229,8 +238,12 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index 56ef9c97..9781eaae 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -135,7 +135,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& @@ -198,9 +198,18 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -210,7 +219,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -229,8 +238,12 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index d483f198..36b0b5a5 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -135,7 +135,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) #endif call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& @@ -198,9 +198,18 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -210,7 +219,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -229,8 +238,12 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 index fda08f4e..79097ca8 100644 --- a/test/omp/psb_tomp.F90 +++ b/test/omp/psb_tomp.F90 @@ -736,6 +736,30 @@ program psb_d_pde3d write(*,*) 'Welcome to PSBLAS version: ',psb_version_string_ write(*,*) 'This is the ',trim(name),' sample program' end if +#if 0 + block + integer(psb_ipk_), parameter :: ntv=10 + integer(psb_ipk_) :: itv(ntv+1),i + itv(:) = 0 + do i=1,ntv + itv(i) = 2 + mod(i,2) + end do + write(0,*) 'ITV before : ',itv(:) + call psi_exscan(ntv,itv,info) + write(0,*) 'ITV after : ',itv(:) + itv(:) = 0 + do i=1,ntv + itv(i) = 2 + mod(i,2) + end do + write(0,*) 'ITV before 1: ',itv(:) + call psi_exscan(ntv,itv,info,shift=ione) + write(0,*) 'ITV after 1: ',itv(:) + ! call a%print('a.mtx',head='Test') + end block +!!$ +!!$ call psb_exit(ctxt) +!!$ stop +#endif ! ! get parameters ! @@ -756,6 +780,7 @@ program psb_d_pde3d end if if (iam == psb_root_) write(psb_out_unit,'("Overall matrix creation time : ",es12.5)')t2 if (iam == psb_root_) write(psb_out_unit,'(" ")') + call a%print('a.mtx',head='Test') ! ! prepare the preconditioner. ! @@ -858,7 +883,6 @@ program psb_d_pde3d write(psb_out_unit,'("Storage format for DESC_A: ",a)') desc_a%get_fmt() end if - ! ! cleanup storage and exit ! diff --git a/test/pargen/runs/ppde.inp b/test/pargen/runs/ppde.inp index fb7af68c..40e3358d 100644 --- a/test/pargen/runs/ppde.inp +++ b/test/pargen/runs/ppde.inp @@ -2,10 +2,10 @@ BICGSTAB Iterative method BICGSTAB CGS BICG BICGSTABL RGMRES FCG CGR BJAC Preconditioner NONE DIAG BJAC CSR Storage format for matrix A: CSR COO -040 Domain size (acutal system is this**3 (pde3d) or **2 (pde2d) ) +140 Domain size (acutal system is this**3 (pde3d) or **2 (pde2d) ) 3 Partition: 1 BLOCK 3 3D 2 Stopping criterion 1 2 -0100 MAXIT +0200 MAXIT 05 ITRACE 002 IRST restart for RGMRES and BiCGSTABL INVK Block Solver ILU,ILUT,INVK,AINVT,AORTH From 494e29dd2eac32b8ebbf72ab16c6fabd26dbed0b Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 23 May 2023 15:56:48 +0200 Subject: [PATCH 25/38] Cosmetic adjustments to COO and BSRCH --- base/serial/impl/psb_c_coo_impl.F90 | 11 +++++------ base/serial/impl/psb_d_coo_impl.F90 | 11 +++++------ base/serial/impl/psb_s_coo_impl.F90 | 11 +++++------ base/serial/impl/psb_z_coo_impl.F90 | 11 +++++------ base/serial/sort/psb_d_qsort_impl.f90 | 23 ++++++++++------------- base/serial/sort/psb_e_qsort_impl.f90 | 23 ++++++++++------------- base/serial/sort/psb_m_qsort_impl.f90 | 23 ++++++++++------------- base/serial/sort/psb_s_qsort_impl.f90 | 23 ++++++++++------------- 8 files changed, 60 insertions(+), 76 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 8f1f54ae..03939afe 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -3669,7 +3669,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_spk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers @@ -3759,7 +3759,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP PARALLEL default(none) & !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3783,7 +3783,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER !$OMP SINGLE - t0 = omp_get_wtime() + !t0 = omp_get_wtime() !$OMP END SINGLE ! ------------------ Sorting and buffers ------------------- @@ -3793,7 +3793,6 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) @@ -3807,8 +3806,8 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP BARRIER !$OMP SINGLE - t1 = omp_get_wtime() - write(0,*) 'Srt&Cpy :',t1-t0 + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 5d50788a..9597c5f5 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -3669,7 +3669,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_dpk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers @@ -3759,7 +3759,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP PARALLEL default(none) & !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3783,7 +3783,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER !$OMP SINGLE - t0 = omp_get_wtime() + !t0 = omp_get_wtime() !$OMP END SINGLE ! ------------------ Sorting and buffers ------------------- @@ -3793,7 +3793,6 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) @@ -3807,8 +3806,8 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP BARRIER !$OMP SINGLE - t1 = omp_get_wtime() - write(0,*) 'Srt&Cpy :',t1-t0 + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 837eb429..174d3e07 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -3669,7 +3669,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_spk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers @@ -3759,7 +3759,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP PARALLEL default(none) & !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3783,7 +3783,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER !$OMP SINGLE - t0 = omp_get_wtime() + !t0 = omp_get_wtime() !$OMP END SINGLE ! ------------------ Sorting and buffers ------------------- @@ -3793,7 +3793,6 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) @@ -3807,8 +3806,8 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP BARRIER !$OMP SINGLE - t1 = omp_get_wtime() - write(0,*) 'Srt&Cpy :',t1-t0 + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 21ae8658..2ccc614a 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -3669,7 +3669,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_dpk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers @@ -3759,7 +3759,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP PARALLEL default(none) & !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) reduction(max: info) + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3783,7 +3783,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER !$OMP SINGLE - t0 = omp_get_wtime() + !t0 = omp_get_wtime() !$OMP END SINGLE ! ------------------ Sorting and buffers ------------------- @@ -3793,7 +3793,6 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) @@ -3807,8 +3806,8 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP BARRIER !$OMP SINGLE - t1 = omp_get_wtime() - write(0,*) 'Srt&Cpy :',t1-t0 + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' diff --git a/base/serial/sort/psb_d_qsort_impl.f90 b/base/serial/sort/psb_d_qsort_impl.f90 index 4d6918e7..8e4b1d21 100644 --- a/base/serial/sort/psb_d_qsort_impl.f90 +++ b/base/serial/sort/psb_d_qsort_impl.f90 @@ -44,8 +44,8 @@ function psb_dbsrch(key,n,v,dir,find) result(ipos) use psb_sort_mod, psb_protect_name => psb_dbsrch implicit none integer(psb_ipk_) :: ipos, n -real(psb_dpk_) :: key -real(psb_dpk_) :: v(:) + real(psb_dpk_) :: key + real(psb_dpk_) :: v(:) integer(psb_ipk_), optional :: dir, find integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ @@ -95,40 +95,37 @@ real(psb_dpk_) :: v(:) if ((m>n) .or. (m<1)) then m = n do while (m>=1) - if (v(m)<=key) then - ipos = m - exit - end if + if (v(m)<=key) exit m = m - 1 end do else do while (mn) .or. (m<1)) then m = 1 do while (m<=n) - if (v(m)>=key) then - ipos = m - exit - end if + if (v(m)>=key) exit m = m + 1 end do else - do while (m>n) - if (v(m)>=key) then + do while (m>1) + if (v(m-1)>=key) then m=m-1 else exit end if end do end if + ipos = max(m,1) + case default write(0,*) 'Wrong FIND' end select diff --git a/base/serial/sort/psb_e_qsort_impl.f90 b/base/serial/sort/psb_e_qsort_impl.f90 index 8be3cd78..c70f8051 100644 --- a/base/serial/sort/psb_e_qsort_impl.f90 +++ b/base/serial/sort/psb_e_qsort_impl.f90 @@ -44,8 +44,8 @@ function psb_ebsrch(key,n,v,dir,find) result(ipos) use psb_sort_mod, psb_protect_name => psb_ebsrch implicit none integer(psb_ipk_) :: ipos, n -integer(psb_epk_) :: key -integer(psb_epk_) :: v(:) + integer(psb_epk_) :: key + integer(psb_epk_) :: v(:) integer(psb_ipk_), optional :: dir, find integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ @@ -95,40 +95,37 @@ integer(psb_epk_) :: v(:) if ((m>n) .or. (m<1)) then m = n do while (m>=1) - if (v(m)<=key) then - ipos = m - exit - end if + if (v(m)<=key) exit m = m - 1 end do else do while (mn) .or. (m<1)) then m = 1 do while (m<=n) - if (v(m)>=key) then - ipos = m - exit - end if + if (v(m)>=key) exit m = m + 1 end do else - do while (m>n) - if (v(m)>=key) then + do while (m>1) + if (v(m-1)>=key) then m=m-1 else exit end if end do end if + ipos = max(m,1) + case default write(0,*) 'Wrong FIND' end select diff --git a/base/serial/sort/psb_m_qsort_impl.f90 b/base/serial/sort/psb_m_qsort_impl.f90 index 460bff43..6b70c3a0 100644 --- a/base/serial/sort/psb_m_qsort_impl.f90 +++ b/base/serial/sort/psb_m_qsort_impl.f90 @@ -44,8 +44,8 @@ function psb_mbsrch(key,n,v,dir,find) result(ipos) use psb_sort_mod, psb_protect_name => psb_mbsrch implicit none integer(psb_ipk_) :: ipos, n -integer(psb_mpk_) :: key -integer(psb_mpk_) :: v(:) + integer(psb_mpk_) :: key + integer(psb_mpk_) :: v(:) integer(psb_ipk_), optional :: dir, find integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ @@ -95,40 +95,37 @@ integer(psb_mpk_) :: v(:) if ((m>n) .or. (m<1)) then m = n do while (m>=1) - if (v(m)<=key) then - ipos = m - exit - end if + if (v(m)<=key) exit m = m - 1 end do else do while (mn) .or. (m<1)) then m = 1 do while (m<=n) - if (v(m)>=key) then - ipos = m - exit - end if + if (v(m)>=key) exit m = m + 1 end do else - do while (m>n) - if (v(m)>=key) then + do while (m>1) + if (v(m-1)>=key) then m=m-1 else exit end if end do end if + ipos = max(m,1) + case default write(0,*) 'Wrong FIND' end select diff --git a/base/serial/sort/psb_s_qsort_impl.f90 b/base/serial/sort/psb_s_qsort_impl.f90 index 44a46e0a..cae32546 100644 --- a/base/serial/sort/psb_s_qsort_impl.f90 +++ b/base/serial/sort/psb_s_qsort_impl.f90 @@ -44,8 +44,8 @@ function psb_sbsrch(key,n,v,dir,find) result(ipos) use psb_sort_mod, psb_protect_name => psb_sbsrch implicit none integer(psb_ipk_) :: ipos, n -real(psb_spk_) :: key -real(psb_spk_) :: v(:) + real(psb_spk_) :: key + real(psb_spk_) :: v(:) integer(psb_ipk_), optional :: dir, find integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ @@ -95,40 +95,37 @@ real(psb_spk_) :: v(:) if ((m>n) .or. (m<1)) then m = n do while (m>=1) - if (v(m)<=key) then - ipos = m - exit - end if + if (v(m)<=key) exit m = m - 1 end do else do while (mn) .or. (m<1)) then m = 1 do while (m<=n) - if (v(m)>=key) then - ipos = m - exit - end if + if (v(m)>=key) exit m = m + 1 end do else - do while (m>n) - if (v(m)>=key) then + do while (m>1) + if (v(m-1)>=key) then m=m-1 else exit end if end do end if + ipos = max(m,1) + case default write(0,*) 'Wrong FIND' end select From 1941affe7a8969bbea6bec721003dad131597b7d Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 30 May 2023 12:26:58 +0200 Subject: [PATCH 26/38] Exposed error in AMG test when not parallelizing generation loop --- base/modules/auxil/psb_e_realloc_mod.F90 | 12 +++++--- base/modules/desc/psb_gen_block_map_mod.F90 | 2 +- base/modules/desc/psb_hash_map_mod.F90 | 33 +++++++++++++++------ base/modules/desc/psb_hash_mod.F90 | 7 ++++- base/tools/psb_dspins.F90 | 8 +++-- test/pargen/psb_d_pde3d.F90 | 1 + 6 files changed, 45 insertions(+), 18 deletions(-) diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 0f2431fd..e11abd7c 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -139,7 +139,7 @@ Contains name='psb_r_m_e_rk1' call psb_erractionsave(err_act) info=psb_success_ - if (debug) write(psb_err_unit,*) 'reallocate D',len + if (debug) write(psb_err_unit,*) 'e_reallocate D',len if (present(lb)) then lb_ = lb @@ -170,7 +170,7 @@ Contains End If else dim = 0 - Allocate(rrax(lb_:ub_),stat=info) + Allocate(rrax(lb_:ub_),stat=info) if (info /= psb_success_) then err=4025 call psb_errpush(err,name, l_err=(/len*1_psb_lpk_/), & @@ -178,6 +178,7 @@ Contains goto 9999 end if endif + if (debug) write(psb_err_unit,*) 'reallocate 2 info',info if (present(pad)) then !$omp parallel do private(i) shared(dim,len) do i=lb_-1+dim+1,lb_-1+len @@ -187,8 +188,9 @@ Contains call psb_erractionrestore(err_act) return -9999 continue +9999 continue info = err + !write(0,*) 'e_realloc : ',info call psb_error_handler(err_act) return @@ -986,7 +988,8 @@ Contains call psb_erractionsave(err_act) info = psb_success_ - if (psb_errstatus_fatal()) then + if (psb_errstatus_fatal()) then + write(0,*) 'From ensure_size: errstatus_fatal()' info=psb_err_from_subroutine_ goto 9999 end if @@ -1028,6 +1031,7 @@ Contains !$OMP END CRITICAL if (info /= psb_success_) then + write(0,*) 'From ensure_size: ',info,psb_err_from_subroutine_ info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') goto 9999 diff --git a/base/modules/desc/psb_gen_block_map_mod.F90 b/base/modules/desc/psb_gen_block_map_mod.F90 index 2c20a547..f0c433e0 100644 --- a/base/modules/desc/psb_gen_block_map_mod.F90 +++ b/base/modules/desc/psb_gen_block_map_mod.F90 @@ -488,7 +488,7 @@ contains integer(psb_ipk_) :: iam, np logical :: owned_ - write(0,*) 'block_g2lv2' + !write(0,*) 'block_g2lv2' info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index eac8cc7a..4e04a371 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -515,7 +515,6 @@ contains endif end if enddo - else write(0,*) 'Hash status: invalid ',idxmap%get_state() idxout(1:is) = -1 @@ -655,7 +654,7 @@ contains #endif logical, volatile :: isLoopValid info = psb_success_ - name = 'hash_g2l_ins' + name = 'hash_g2lv1_ins' call psb_erractionsave(err_act) ctxt = idxmap%get_ctxt() @@ -679,7 +678,7 @@ contains mglob = idxmap%get_gr() nrow = idxmap%get_lr() - + !write(0,*) me,name,' before loop ',psb_errstatus_fatal() if (use_openmp) then #ifdef OPENMP !call OMP_init_lock(ins_lck) @@ -751,6 +750,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then + !write(0,*) 'Error spot 1' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -832,6 +832,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then + !write(0,*) 'Error spot 2' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -883,38 +884,42 @@ contains ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. + !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - + !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() if (lip < 0) then !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing, so this lock is relatively rare). ncol = idxmap%get_lc() - nxt = ncol + 1 + nxt = ncol + 1 + !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - + !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() if (lip > 0) then idx(i) = lip else if (lip < 0) then ! Index not found + !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() lip = tlip if (info >= 0) then + !write(0,*) 'Error before spot 3', info ! 'nxt' is not equal to 'tlip' when the key is already inside ! the hash map. In that case 'tlip' is the value corresponding ! to the existing mapping. if (nxt == tlip) then ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then + !write(0,*) 'Error spot 3', info call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -996,6 +1001,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then + !write(0,*) 'Error spot 4' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -1069,6 +1075,7 @@ contains call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then + !write(0,*) 'Error spot' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. @@ -1113,9 +1120,11 @@ contains if (info >=0) then if (nxt == lip) then ncol = max(nxt,ncol) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 + !write(0,*) 'Error spot' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. @@ -1162,6 +1171,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 + write(0,*) 'Error spot 5' call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. @@ -1205,6 +1215,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 + write(0,*) 'Error spot 6' ch_err='psb_ensure_size' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) @@ -1239,6 +1250,7 @@ contains #endif if (.not. isLoopValid) goto 9999 end if + !write(0,*) me,name,' after loop ',psb_errstatus_fatal() call psb_erractionrestore(err_act) return @@ -1252,6 +1264,7 @@ contains subroutine hash_g2lv2_ins(idxin,idxout,idxmap,info,mask,lidx) use psb_realloc_mod + use psb_error_mod implicit none class(psb_hash_map), intent(inout) :: idxmap integer(psb_lpk_), intent(in) :: idxin(:) @@ -1264,7 +1277,9 @@ contains is = size(idxin) im = min(is,size(idxout)) + !write(0,*) 'g2lv2_ins before realloc ',psb_errstatus_fatal() call psb_realloc(im,tidx,info) + !write(0,*) 'g2lv2_ins after realloc ',psb_errstatus_fatal() tidx(1:im) = idxin(1:im) call idxmap%g2lip_ins(tidx(1:im),info,mask=mask,lidx=lidx) idxout(1:im) = tidx(1:im) diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index c3c5ba20..68e10a05 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -407,6 +407,7 @@ contains if (hash%table(hk,1) == key) then val = hash%table(hk,2) info = HashDuplicate + !write(0,*) 'In searchinskey 1 : ', info, HashDuplicate return end if !$omp critical(hashsearchins) @@ -440,11 +441,15 @@ contains end if end if !$omp end critical(hashsearchins) - if (info /= HashOk) return + if (info /= HashOk) then + write(0,*) 'In searchinskey 2: ', info + return + end if if (val > 0) return hk = hk - hd if (hk < 0) hk = hk + hsize end do + write(0,*) 'In searchinskey 3: ', info end subroutine psb_hash_lsearchinskey recursive subroutine psb_hash_isearchinskey(key,val,nextval,hash,info) diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 3b7e0e80..094d0a4b 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -137,10 +137,12 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) #if defined(OPENMP) !$omp parallel private(ila,jla,nrow,ncol,nnl,k) #endif + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) @@ -148,7 +150,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then @@ -181,7 +183,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name) !goto 9999 end if - + !write(0,*) me,' after csput',psb_errstatus_fatal() #if defined(OPENMP) !$omp end parallel #endif diff --git a/test/pargen/psb_d_pde3d.F90 b/test/pargen/psb_d_pde3d.F90 index 6e895c00..eebc5ad8 100644 --- a/test/pargen/psb_d_pde3d.F90 +++ b/test/pargen/psb_d_pde3d.F90 @@ -737,6 +737,7 @@ program psb_d_pde3d ! ! allocate and fill in the coefficient matrix, rhs and initial guess ! + call psb_cd_set_large_threshold(100_psb_lpk_) call psb_barrier(ctxt) t1 = psb_wtime() call psb_gen_pde3d(ctxt,idim,a,bv,xxv,desc_a,afmt,info,partition=ipart) From db0e4db507027444f5c93e489596398c893571fb Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 2 Jun 2023 11:06:53 +0200 Subject: [PATCH 27/38] Minimize debug sttements in hash_ins --- base/modules/auxil/psb_e_realloc_mod.F90 | 12 ++++-------- base/modules/desc/psb_hash_map_mod.F90 | 12 +++++++++++- base/modules/desc/psb_hash_mod.F90 | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index e11abd7c..0f2431fd 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -139,7 +139,7 @@ Contains name='psb_r_m_e_rk1' call psb_erractionsave(err_act) info=psb_success_ - if (debug) write(psb_err_unit,*) 'e_reallocate D',len + if (debug) write(psb_err_unit,*) 'reallocate D',len if (present(lb)) then lb_ = lb @@ -170,7 +170,7 @@ Contains End If else dim = 0 - Allocate(rrax(lb_:ub_),stat=info) + Allocate(rrax(lb_:ub_),stat=info) if (info /= psb_success_) then err=4025 call psb_errpush(err,name, l_err=(/len*1_psb_lpk_/), & @@ -178,7 +178,6 @@ Contains goto 9999 end if endif - if (debug) write(psb_err_unit,*) 'reallocate 2 info',info if (present(pad)) then !$omp parallel do private(i) shared(dim,len) do i=lb_-1+dim+1,lb_-1+len @@ -188,9 +187,8 @@ Contains call psb_erractionrestore(err_act) return -9999 continue +9999 continue info = err - !write(0,*) 'e_realloc : ',info call psb_error_handler(err_act) return @@ -988,8 +986,7 @@ Contains call psb_erractionsave(err_act) info = psb_success_ - if (psb_errstatus_fatal()) then - write(0,*) 'From ensure_size: errstatus_fatal()' + if (psb_errstatus_fatal()) then info=psb_err_from_subroutine_ goto 9999 end if @@ -1031,7 +1028,6 @@ Contains !$OMP END CRITICAL if (info /= psb_success_) then - write(0,*) 'From ensure_size: ',info,psb_err_from_subroutine_ info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') goto 9999 diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index 4e04a371..b7f53879 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -375,6 +375,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -414,6 +415,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -503,6 +505,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -541,6 +544,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -704,6 +708,7 @@ contains ! $ OMP reduction(.AND.:isLoopValid) do i = 1, is info = 0 + if (.not. isLoopValid) cycle if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -790,6 +795,7 @@ contains ! $ OMP reduction(.AND.:isLoopValid) do i = 1, is info = 0 + if (.not. isLoopValid) cycle ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 @@ -867,10 +873,11 @@ contains ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & ! $ OMP private(i,ip,lip,tlip,nxt,info) & ! $ OMP reduction(.AND.:isLoopValid) - !$omp critical(hash_g2l_ins) + !$omp critical(hash_g2l_ins) do i = 1, is info = 0 + if (.not. isLoopValid) cycle if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -905,6 +912,7 @@ contains ! Index not found !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() lip = tlip @@ -918,6 +926,7 @@ contains ncol = MAX(ncol,nxt) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) + if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() if (info /= psb_success_) then !write(0,*) 'Error spot 3', info call psb_errpush(psb_err_from_subroutine_ai_,name,& @@ -958,6 +967,7 @@ contains !$omp critical(hash_g2l_ins) do i = 1, is info = 0 + if (.not. isLoopValid) cycle ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index 68e10a05..9e45e1f0 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -449,7 +449,7 @@ contains hk = hk - hd if (hk < 0) hk = hk + hsize end do - write(0,*) 'In searchinskey 3: ', info + !write(0,*) 'In searchinskey 3: ', info end subroutine psb_hash_lsearchinskey recursive subroutine psb_hash_isearchinskey(key,val,nextval,hash,info) From 347352fe1e5b3ba8f8ebfa085c34c010a8474df0 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 2 Jun 2023 11:07:23 +0200 Subject: [PATCH 28/38] Make spins work in OpenMP from either par or serial --- base/tools/psb_cspins.F90 | 123 ++++++++++++++++++++++++++++++++---- base/tools/psb_dspins.F90 | 119 +++++++++++++++++++++++++++++++--- base/tools/psb_sspins.F90 | 123 ++++++++++++++++++++++++++++++++---- base/tools/psb_zspins.F90 | 123 ++++++++++++++++++++++++++++++++---- test/pargen/psb_d_pde3d.F90 | 1 - 5 files changed, 445 insertions(+), 44 deletions(-) diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index f523a529..e5f2731d 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -135,28 +135,132 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol,nnl,k) -#endif + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - + + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - !goto 9999 + goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - !goto 9999 + goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then @@ -179,11 +283,8 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - !goto 9999 + goto 9999 end if - -#if defined(OPENMP) - !$omp end parallel #endif if (info /= 0) goto 9999 endif diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 094d0a4b..cdeaa931 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -135,18 +135,120 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol,nnl,k) -#endif + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) - !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - !goto 9999 + goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() @@ -156,9 +258,9 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - !goto 9999 + goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then @@ -181,11 +283,8 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - !goto 9999 + goto 9999 end if - !write(0,*) me,' after csput',psb_errstatus_fatal() -#if defined(OPENMP) - !$omp end parallel #endif if (info /= 0) goto 9999 endif diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index 9781eaae..39e4ad79 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -135,28 +135,132 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol,nnl,k) -#endif + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - + + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - !goto 9999 + goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - !goto 9999 + goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then @@ -179,11 +283,8 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - !goto 9999 + goto 9999 end if - -#if defined(OPENMP) - !$omp end parallel #endif if (info /= 0) goto 9999 endif diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index 36b0b5a5..0c0ff91f 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -135,28 +135,132 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) goto 9999 end if #if defined(OPENMP) - !$omp parallel private(ila,jla,nrow,ncol,nnl,k) -#endif + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) - + + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) - !goto 9999 + goto 9999 end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - !goto 9999 + goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then @@ -179,11 +283,8 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) else info = psb_err_invalid_a_and_cd_state_ call psb_errpush(info,name) - !goto 9999 + goto 9999 end if - -#if defined(OPENMP) - !$omp end parallel #endif if (info /= 0) goto 9999 endif diff --git a/test/pargen/psb_d_pde3d.F90 b/test/pargen/psb_d_pde3d.F90 index eebc5ad8..6e895c00 100644 --- a/test/pargen/psb_d_pde3d.F90 +++ b/test/pargen/psb_d_pde3d.F90 @@ -737,7 +737,6 @@ program psb_d_pde3d ! ! allocate and fill in the coefficient matrix, rhs and initial guess ! - call psb_cd_set_large_threshold(100_psb_lpk_) call psb_barrier(ctxt) t1 = psb_wtime() call psb_gen_pde3d(ctxt,idim,a,bv,xxv,desc_a,afmt,info,partition=ipart) From a66778f2706db6c498fc45efb22994a76128cfd4 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 21 Jun 2023 12:21:27 +0200 Subject: [PATCH 29/38] Improve coo and merge development --- base/modules/auxil/psb_c_hsort_x_mod.f90 | 6 +-- base/modules/auxil/psb_c_realloc_mod.F90 | 52 ++++++++++++----------- base/modules/auxil/psb_d_hsort_x_mod.f90 | 6 +-- base/modules/auxil/psb_d_realloc_mod.F90 | 52 ++++++++++++----------- base/modules/auxil/psb_e_realloc_mod.F90 | 52 ++++++++++++----------- base/modules/auxil/psb_i2_realloc_mod.F90 | 52 ++++++++++++----------- base/modules/auxil/psb_i_hsort_x_mod.f90 | 6 +-- base/modules/auxil/psb_l_hsort_x_mod.f90 | 6 +-- base/modules/auxil/psb_m_realloc_mod.F90 | 52 ++++++++++++----------- base/modules/auxil/psb_s_hsort_x_mod.f90 | 6 +-- base/modules/auxil/psb_s_realloc_mod.F90 | 52 ++++++++++++----------- base/modules/auxil/psb_z_hsort_x_mod.f90 | 6 +-- base/modules/auxil/psb_z_realloc_mod.F90 | 52 ++++++++++++----------- base/serial/impl/psb_c_coo_impl.F90 | 6 +-- base/serial/impl/psb_d_coo_impl.F90 | 6 +-- base/serial/impl/psb_s_coo_impl.F90 | 6 +-- base/serial/impl/psb_z_coo_impl.F90 | 6 +-- 17 files changed, 219 insertions(+), 205 deletions(-) diff --git a/base/modules/auxil/psb_c_hsort_x_mod.f90 b/base/modules/auxil/psb_c_hsort_x_mod.f90 index c0e39411..8f0437f7 100644 --- a/base/modules/auxil/psb_c_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_c_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_c_realloc_mod.F90 b/base/modules/auxil/psb_c_realloc_mod.F90 index 9e6af5a8..11aebdd8 100644 --- a/base/modules/auxil/psb_c_realloc_mod.F90 +++ b/base/modules/auxil/psb_c_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_d_hsort_x_mod.f90 b/base/modules/auxil/psb_d_hsort_x_mod.f90 index 7273e972..ba45d683 100644 --- a/base/modules/auxil/psb_d_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_d_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_d_realloc_mod.F90 b/base/modules/auxil/psb_d_realloc_mod.F90 index 672b4677..79c7864d 100644 --- a/base/modules/auxil/psb_d_realloc_mod.F90 +++ b/base/modules/auxil/psb_d_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 0f2431fd..2cdbe6ff 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_i2_realloc_mod.F90 b/base/modules/auxil/psb_i2_realloc_mod.F90 index 22e85d36..b89c641f 100644 --- a/base/modules/auxil/psb_i2_realloc_mod.F90 +++ b/base/modules/auxil/psb_i2_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_i_hsort_x_mod.f90 b/base/modules/auxil/psb_i_hsort_x_mod.f90 index 0d1288a6..4bbc3d7f 100644 --- a/base/modules/auxil/psb_i_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_i_hsort_x_mod.f90 @@ -124,7 +124,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -237,9 +237,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_l_hsort_x_mod.f90 b/base/modules/auxil/psb_l_hsort_x_mod.f90 index 487e8ce9..5134d6bb 100644 --- a/base/modules/auxil/psb_l_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_l_hsort_x_mod.f90 @@ -124,7 +124,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_lpk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -237,9 +237,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_lpk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_lpk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_m_realloc_mod.F90 b/base/modules/auxil/psb_m_realloc_mod.F90 index c81ed83a..a2261bd6 100644 --- a/base/modules/auxil/psb_m_realloc_mod.F90 +++ b/base/modules/auxil/psb_m_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_s_hsort_x_mod.f90 b/base/modules/auxil/psb_s_hsort_x_mod.f90 index 34f69ea4..204dbbf4 100644 --- a/base/modules/auxil/psb_s_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_s_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_s_realloc_mod.F90 b/base/modules/auxil/psb_s_realloc_mod.F90 index 0b2873e3..28e82312 100644 --- a/base/modules/auxil/psb_s_realloc_mod.F90 +++ b/base/modules/auxil/psb_s_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_z_hsort_x_mod.f90 b/base/modules/auxil/psb_z_hsort_x_mod.f90 index 39f52e4f..4b7302aa 100644 --- a/base/modules/auxil/psb_z_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_z_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_z_realloc_mod.F90 b/base/modules/auxil/psb_z_realloc_mod.F90 index e6eeac2f..a699fadb 100644 --- a/base/modules/auxil/psb_z_realloc_mod.F90 +++ b/base/modules/auxil/psb_z_realloc_mod.F90 @@ -790,7 +790,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:) = vin(:) + !$omp end workshare endif endif @@ -836,7 +838,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -1009,18 +1013,17 @@ Contains !!$ goto 9999 !!$ End If !!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz)) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1036,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1089,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 03939afe..a8ea3613 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -4563,7 +4563,7 @@ function psb_lc_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +4630,7 @@ function psb_lc_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +4680,7 @@ function psb_lc_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 9597c5f5..86a5d84a 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -4563,7 +4563,7 @@ function psb_ld_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +4630,7 @@ function psb_ld_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +4680,7 @@ function psb_ld_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 174d3e07..d857b74f 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -4563,7 +4563,7 @@ function psb_ls_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +4630,7 @@ function psb_ls_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +4680,7 @@ function psb_ls_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 2ccc614a..ac39bcba 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -4563,7 +4563,7 @@ function psb_lz_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +4630,7 @@ function psb_lz_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +4680,7 @@ function psb_lz_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do From d378266f334004a143ea29b8e03dade3692456c9 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 21 Jun 2023 13:37:23 +0200 Subject: [PATCH 30/38] Fix synatx error --- base/modules/auxil/psb_c_realloc_mod.F90 | 2 +- base/modules/auxil/psb_d_realloc_mod.F90 | 2 +- base/modules/auxil/psb_e_realloc_mod.F90 | 2 +- base/modules/auxil/psb_i2_realloc_mod.F90 | 2 +- base/modules/auxil/psb_m_realloc_mod.F90 | 2 +- base/modules/auxil/psb_s_realloc_mod.F90 | 2 +- base/modules/auxil/psb_z_realloc_mod.F90 | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/base/modules/auxil/psb_c_realloc_mod.F90 b/base/modules/auxil/psb_c_realloc_mod.F90 index 11aebdd8..f5f46fcd 100644 --- a/base/modules/auxil/psb_c_realloc_mod.F90 +++ b/base/modules/auxil/psb_c_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif diff --git a/base/modules/auxil/psb_d_realloc_mod.F90 b/base/modules/auxil/psb_d_realloc_mod.F90 index 79c7864d..9f994e49 100644 --- a/base/modules/auxil/psb_d_realloc_mod.F90 +++ b/base/modules/auxil/psb_d_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 2cdbe6ff..5b9c38be 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif diff --git a/base/modules/auxil/psb_i2_realloc_mod.F90 b/base/modules/auxil/psb_i2_realloc_mod.F90 index b89c641f..4a25a44e 100644 --- a/base/modules/auxil/psb_i2_realloc_mod.F90 +++ b/base/modules/auxil/psb_i2_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif diff --git a/base/modules/auxil/psb_m_realloc_mod.F90 b/base/modules/auxil/psb_m_realloc_mod.F90 index a2261bd6..26584fbd 100644 --- a/base/modules/auxil/psb_m_realloc_mod.F90 +++ b/base/modules/auxil/psb_m_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif diff --git a/base/modules/auxil/psb_s_realloc_mod.F90 b/base/modules/auxil/psb_s_realloc_mod.F90 index 28e82312..1392bc18 100644 --- a/base/modules/auxil/psb_s_realloc_mod.F90 +++ b/base/modules/auxil/psb_s_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif diff --git a/base/modules/auxil/psb_z_realloc_mod.F90 b/base/modules/auxil/psb_z_realloc_mod.F90 index a699fadb..c9201e83 100644 --- a/base/modules/auxil/psb_z_realloc_mod.F90 +++ b/base/modules/auxil/psb_z_realloc_mod.F90 @@ -1021,7 +1021,7 @@ Contains if (present(newsz)) then isz = max(len+1,1,newsz) else if (present(addsz)) then - isz = max(len,1,isz+addsz)) + isz = max(len,1,isz+addsz) else isz = max(len,1,int(1.25*isz)) endif From 2f403e0df75a9d41d5a5664355c6d043bba9f477 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 28 Jun 2023 11:16:53 +0200 Subject: [PATCH 31/38] Rework cp_{from|to}_fmt for better OpenMP performance --- base/modules/auxil/psb_c_realloc_mod.F90 | 10 +++-- base/modules/auxil/psb_d_realloc_mod.F90 | 10 +++-- base/modules/auxil/psb_e_realloc_mod.F90 | 10 +++-- base/modules/auxil/psb_i2_realloc_mod.F90 | 10 +++-- base/modules/auxil/psb_m_realloc_mod.F90 | 10 +++-- base/modules/auxil/psb_s_realloc_mod.F90 | 10 +++-- base/modules/auxil/psb_z_realloc_mod.F90 | 10 +++-- base/serial/impl/psb_c_csc_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_c_csr_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_d_csc_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_d_csr_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_s_csc_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_s_csr_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_z_csc_impl.F90 | 49 ++++++++++++++++++++--- base/serial/impl/psb_z_csr_impl.F90 | 49 ++++++++++++++++++++--- 15 files changed, 386 insertions(+), 76 deletions(-) diff --git a/base/modules/auxil/psb_c_realloc_mod.F90 b/base/modules/auxil/psb_c_realloc_mod.F90 index f5f46fcd..c042f2e6 100644 --- a/base/modules/auxil/psb_c_realloc_mod.F90 +++ b/base/modules/auxil/psb_c_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/modules/auxil/psb_d_realloc_mod.F90 b/base/modules/auxil/psb_d_realloc_mod.F90 index 9f994e49..f1fac143 100644 --- a/base/modules/auxil/psb_d_realloc_mod.F90 +++ b/base/modules/auxil/psb_d_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 5b9c38be..3f8b67f8 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/modules/auxil/psb_i2_realloc_mod.F90 b/base/modules/auxil/psb_i2_realloc_mod.F90 index 4a25a44e..c6babd68 100644 --- a/base/modules/auxil/psb_i2_realloc_mod.F90 +++ b/base/modules/auxil/psb_i2_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/modules/auxil/psb_m_realloc_mod.F90 b/base/modules/auxil/psb_m_realloc_mod.F90 index 26584fbd..2027d9b7 100644 --- a/base/modules/auxil/psb_m_realloc_mod.F90 +++ b/base/modules/auxil/psb_m_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/modules/auxil/psb_s_realloc_mod.F90 b/base/modules/auxil/psb_s_realloc_mod.F90 index 1392bc18..1c0ca7df 100644 --- a/base/modules/auxil/psb_s_realloc_mod.F90 +++ b/base/modules/auxil/psb_s_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/modules/auxil/psb_z_realloc_mod.F90 b/base/modules/auxil/psb_z_realloc_mod.F90 index c9201e83..ece5680d 100644 --- a/base/modules/auxil/psb_z_realloc_mod.F90 +++ b/base/modules/auxil/psb_z_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,9 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - !$omp workshare - vout(:) = vin(:) - !$omp end workshare + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif diff --git a/base/serial/impl/psb_c_csc_impl.F90 b/base/serial/impl/psb_c_csc_impl.F90 index d8837b76..54332d06 100644 --- a/base/serial/impl/psb_c_csc_impl.F90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_c_cp_csc_to_fmt(a,b,info) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_c_cp_csc_from_fmt(a,b,info) a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 47779646..f06bf549 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -3190,9 +3190,28 @@ subroutine psb_c_cp_csr_to_fmt(a,b,info) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3295,27 @@ subroutine psb_c_cp_csr_from_fmt(a,b,info) a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_d_csc_impl.F90 b/base/serial/impl/psb_d_csc_impl.F90 index a50d5026..1761b051 100644 --- a/base/serial/impl/psb_d_csc_impl.F90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_d_cp_csc_to_fmt(a,b,info) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_d_cp_csc_from_fmt(a,b,info) a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 10518a2d..529fed79 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3190,9 +3190,28 @@ subroutine psb_d_cp_csr_to_fmt(a,b,info) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3295,27 @@ subroutine psb_d_cp_csr_from_fmt(a,b,info) a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_s_csc_impl.F90 b/base/serial/impl/psb_s_csc_impl.F90 index dc4ca30b..a66b7dc0 100644 --- a/base/serial/impl/psb_s_csc_impl.F90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_s_cp_csc_to_fmt(a,b,info) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_s_cp_csc_from_fmt(a,b,info) a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 22ceb6e5..cb9e413d 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -3190,9 +3190,28 @@ subroutine psb_s_cp_csr_to_fmt(a,b,info) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3295,27 @@ subroutine psb_s_cp_csr_from_fmt(a,b,info) a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_z_csc_impl.F90 b/base/serial/impl/psb_z_csc_impl.F90 index 035d0e83..e5516bd9 100644 --- a/base/serial/impl/psb_z_csc_impl.F90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_z_cp_csc_to_fmt(a,b,info) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_z_cp_csc_from_fmt(a,b,info) a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 60839e5d..4c976e14 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -3190,9 +3190,28 @@ subroutine psb_z_cp_csr_to_fmt(a,b,info) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3295,27 @@ subroutine psb_z_cp_csr_from_fmt(a,b,info) a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default From bb9f213551200577899fd86ad9cef01a1e096d17 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 18 Jul 2023 18:19:03 +0200 Subject: [PATCH 32/38] Define and implement OMP version of TRIL/TRIU --- base/modules/serial/psb_c_base_mat_mod.F90 | 89 +++ base/modules/serial/psb_d_base_mat_mod.F90 | 89 +++ base/modules/serial/psb_s_base_mat_mod.F90 | 89 +++ base/modules/serial/psb_z_base_mat_mod.F90 | 89 +++ base/serial/impl/psb_c_coo_impl.F90 | 597 +++++++++++++++++++++ base/serial/impl/psb_d_coo_impl.F90 | 597 +++++++++++++++++++++ base/serial/impl/psb_s_coo_impl.F90 | 597 +++++++++++++++++++++ base/serial/impl/psb_z_coo_impl.F90 | 597 +++++++++++++++++++++ 8 files changed, 2744 insertions(+) diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90 index a5dd0fd0..33982e3a 100644 --- a/base/modules/serial/psb_c_base_mat_mod.F90 +++ b/base/modules/serial/psb_c_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_c_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_c_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_c_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_c_coo_ensure_size + procedure, pass(a) :: tril => psb_c_coo_tril + procedure, pass(a) :: triu => psb_c_coo_triu procedure, pass(a) :: cp_to_coo => psb_c_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_c_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_c_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_c_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_c_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_c_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_c_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_c_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_c_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_c_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_c_coo_triu + end interface + !> !! \memberof psb_c_coo_sparse_mat diff --git a/base/modules/serial/psb_d_base_mat_mod.F90 b/base/modules/serial/psb_d_base_mat_mod.F90 index fac0af20..5f4c76df 100644 --- a/base/modules/serial/psb_d_base_mat_mod.F90 +++ b/base/modules/serial/psb_d_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_d_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_d_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_d_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_d_coo_ensure_size + procedure, pass(a) :: tril => psb_d_coo_tril + procedure, pass(a) :: triu => psb_d_coo_triu procedure, pass(a) :: cp_to_coo => psb_d_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_d_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_d_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_d_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_d_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_d_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_d_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_d_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_d_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_d_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_d_coo_triu + end interface + !> !! \memberof psb_d_coo_sparse_mat diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90 index 186ae577..92bda7d8 100644 --- a/base/modules/serial/psb_s_base_mat_mod.F90 +++ b/base/modules/serial/psb_s_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_s_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_s_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_s_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_s_coo_ensure_size + procedure, pass(a) :: tril => psb_s_coo_tril + procedure, pass(a) :: triu => psb_s_coo_triu procedure, pass(a) :: cp_to_coo => psb_s_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_s_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_s_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_s_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_s_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_s_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_s_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_s_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_s_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_s_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_s_coo_triu + end interface + !> !! \memberof psb_s_coo_sparse_mat diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90 index 3ce9074f..3e8196f4 100644 --- a/base/modules/serial/psb_z_base_mat_mod.F90 +++ b/base/modules/serial/psb_z_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_z_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_z_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_z_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_z_coo_ensure_size + procedure, pass(a) :: tril => psb_z_coo_tril + procedure, pass(a) :: triu => psb_z_coo_triu procedure, pass(a) :: cp_to_coo => psb_z_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_z_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_z_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_z_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_z_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_z_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_z_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_z_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_z_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_z_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_z_coo_triu + end interface + !> !! \memberof psb_z_coo_sparse_mat diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index a8ea3613..f740b3a7 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -3489,6 +3489,603 @@ subroutine psb_c_coo_mv_from(a,b) end subroutine psb_c_coo_mv_from +! +! CSR implementation of tril/triu +! +subroutine psb_c_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_tril + implicit none + + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_coo_tril + +subroutine psb_c_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_triu + implicit none + + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do + end do + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_coo_triu + subroutine psb_c_fix_coo(a,info,idir) use psb_const_mod diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 86a5d84a..1aafe2e4 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -3489,6 +3489,603 @@ subroutine psb_d_coo_mv_from(a,b) end subroutine psb_d_coo_mv_from +! +! CSR implementation of tril/triu +! +subroutine psb_d_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_tril + implicit none + + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_coo_tril + +subroutine psb_d_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_triu + implicit none + + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do + end do + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_coo_triu + subroutine psb_d_fix_coo(a,info,idir) use psb_const_mod diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index d857b74f..d99017bf 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -3489,6 +3489,603 @@ subroutine psb_s_coo_mv_from(a,b) end subroutine psb_s_coo_mv_from +! +! CSR implementation of tril/triu +! +subroutine psb_s_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_tril + implicit none + + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_coo_tril + +subroutine psb_s_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_triu + implicit none + + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do + end do + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_coo_triu + subroutine psb_s_fix_coo(a,info,idir) use psb_const_mod diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index ac39bcba..5532c7f1 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -3489,6 +3489,603 @@ subroutine psb_z_coo_mv_from(a,b) end subroutine psb_z_coo_mv_from +! +! CSR implementation of tril/triu +! +subroutine psb_z_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_tril + implicit none + + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_coo_tril + +subroutine psb_z_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_triu + implicit none + + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)a%val, ja => a%ja, irp=>a%irp) + do i=imin_,imax_ + do k=irp(i),irp(i+1)-1 + if ((jmin_<=j).and.(j<=jmax_)) then + if ((ja(k)-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do + end do + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_coo_triu + subroutine psb_z_fix_coo(a,info,idir) use psb_const_mod From 5e691d5bffc858592527a6f13c010f39ff7d98d8 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 24 Jul 2023 13:21:20 +0200 Subject: [PATCH 33/38] Some improvements for openmp vector updates --- base/modules/auxil/psb_c_realloc_mod.F90 | 18 ------------------ base/modules/auxil/psb_d_realloc_mod.F90 | 18 ------------------ base/modules/auxil/psb_e_realloc_mod.F90 | 18 ------------------ base/modules/auxil/psb_i2_realloc_mod.F90 | 18 ------------------ base/modules/auxil/psb_m_realloc_mod.F90 | 18 ------------------ base/modules/auxil/psb_s_realloc_mod.F90 | 18 ------------------ base/modules/auxil/psb_z_realloc_mod.F90 | 18 ------------------ base/modules/serial/psb_c_base_vect_mod.F90 | 6 +++++- base/modules/serial/psb_d_base_vect_mod.F90 | 6 +++++- base/modules/serial/psb_i_base_vect_mod.F90 | 6 +++++- base/modules/serial/psb_l_base_vect_mod.F90 | 6 +++++- base/modules/serial/psb_s_base_vect_mod.F90 | 6 +++++- base/modules/serial/psb_z_base_vect_mod.F90 | 6 +++++- 13 files changed, 30 insertions(+), 132 deletions(-) diff --git a/base/modules/auxil/psb_c_realloc_mod.F90 b/base/modules/auxil/psb_c_realloc_mod.F90 index c042f2e6..9b22bee7 100644 --- a/base/modules/auxil/psb_c_realloc_mod.F90 +++ b/base/modules/auxil/psb_c_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_d_realloc_mod.F90 b/base/modules/auxil/psb_d_realloc_mod.F90 index f1fac143..ca85e0ec 100644 --- a/base/modules/auxil/psb_d_realloc_mod.F90 +++ b/base/modules/auxil/psb_d_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 3f8b67f8..06a6d034 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_i2_realloc_mod.F90 b/base/modules/auxil/psb_i2_realloc_mod.F90 index c6babd68..146bdf7e 100644 --- a/base/modules/auxil/psb_i2_realloc_mod.F90 +++ b/base/modules/auxil/psb_i2_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_m_realloc_mod.F90 b/base/modules/auxil/psb_m_realloc_mod.F90 index 2027d9b7..4d2f9316 100644 --- a/base/modules/auxil/psb_m_realloc_mod.F90 +++ b/base/modules/auxil/psb_m_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_s_realloc_mod.F90 b/base/modules/auxil/psb_s_realloc_mod.F90 index 1c0ca7df..f064e606 100644 --- a/base/modules/auxil/psb_s_realloc_mod.F90 +++ b/base/modules/auxil/psb_s_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_z_realloc_mod.F90 b/base/modules/auxil/psb_z_realloc_mod.F90 index ece5680d..e9eb26d3 100644 --- a/base/modules/auxil/psb_z_realloc_mod.F90 +++ b/base/modules/auxil/psb_z_realloc_mod.F90 @@ -997,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/serial/psb_c_base_vect_mod.F90 b/base/modules/serial/psb_c_base_vect_mod.F90 index 44044771..df15e0c9 100644 --- a/base/modules/serial/psb_c_base_vect_mod.F90 +++ b/base/modules/serial/psb_c_base_vect_mod.F90 @@ -481,7 +481,11 @@ contains implicit none class(psb_c_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=czero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=czero + !$omp end workshare + end if call x%set_host() end subroutine c_base_zero diff --git a/base/modules/serial/psb_d_base_vect_mod.F90 b/base/modules/serial/psb_d_base_vect_mod.F90 index a28d12f6..87f5b0e4 100644 --- a/base/modules/serial/psb_d_base_vect_mod.F90 +++ b/base/modules/serial/psb_d_base_vect_mod.F90 @@ -488,7 +488,11 @@ contains implicit none class(psb_d_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=dzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=dzero + !$omp end workshare + end if call x%set_host() end subroutine d_base_zero diff --git a/base/modules/serial/psb_i_base_vect_mod.F90 b/base/modules/serial/psb_i_base_vect_mod.F90 index 0289ecd0..a5cddeb5 100644 --- a/base/modules/serial/psb_i_base_vect_mod.F90 +++ b/base/modules/serial/psb_i_base_vect_mod.F90 @@ -417,7 +417,11 @@ contains implicit none class(psb_i_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=izero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=izero + !$omp end workshare + end if call x%set_host() end subroutine i_base_zero diff --git a/base/modules/serial/psb_l_base_vect_mod.F90 b/base/modules/serial/psb_l_base_vect_mod.F90 index d8654f63..93b29e17 100644 --- a/base/modules/serial/psb_l_base_vect_mod.F90 +++ b/base/modules/serial/psb_l_base_vect_mod.F90 @@ -418,7 +418,11 @@ contains implicit none class(psb_l_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=lzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=lzero + !$omp end workshare + end if call x%set_host() end subroutine l_base_zero diff --git a/base/modules/serial/psb_s_base_vect_mod.F90 b/base/modules/serial/psb_s_base_vect_mod.F90 index 4bd6bbfb..fccd846b 100644 --- a/base/modules/serial/psb_s_base_vect_mod.F90 +++ b/base/modules/serial/psb_s_base_vect_mod.F90 @@ -488,7 +488,11 @@ contains implicit none class(psb_s_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=szero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=szero + !$omp end workshare + end if call x%set_host() end subroutine s_base_zero diff --git a/base/modules/serial/psb_z_base_vect_mod.F90 b/base/modules/serial/psb_z_base_vect_mod.F90 index c52dcd59..2a14de21 100644 --- a/base/modules/serial/psb_z_base_vect_mod.F90 +++ b/base/modules/serial/psb_z_base_vect_mod.F90 @@ -481,7 +481,11 @@ contains implicit none class(psb_z_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=zzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=zzero + !$omp end workshare + end if call x%set_host() end subroutine z_base_zero From ca82520b88f404833295697e1a3921bd76c11c52 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 1 Aug 2023 10:16:22 +0200 Subject: [PATCH 34/38] Reworked CSR TRIL/TRIU for OpenMP --- base/serial/impl/psb_c_coo_impl.F90 | 2 +- base/serial/impl/psb_c_csr_impl.F90 | 304 +++++++++++++++++++++++++++- base/serial/impl/psb_d_coo_impl.F90 | 2 +- base/serial/impl/psb_d_csr_impl.F90 | 304 +++++++++++++++++++++++++++- base/serial/impl/psb_s_coo_impl.F90 | 2 +- base/serial/impl/psb_s_csr_impl.F90 | 304 +++++++++++++++++++++++++++- base/serial/impl/psb_z_coo_impl.F90 | 2 +- base/serial/impl/psb_z_csr_impl.F90 | 304 +++++++++++++++++++++++++++- 8 files changed, 1212 insertions(+), 12 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index f740b3a7..b800ce22 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -3490,7 +3490,7 @@ end subroutine psb_c_coo_mv_from ! -! CSR implementation of tril/triu +! COO implementation of tril/triu ! subroutine psb_c_coo_tril(a,l,info,& & diag,imin,imax,jmin,jmax,rscale,cscale,u) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index f06bf549..6c3ecec6 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_c_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_c_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_c_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_c_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 1aafe2e4..5452ab2b 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -3490,7 +3490,7 @@ end subroutine psb_d_coo_mv_from ! -! CSR implementation of tril/triu +! COO implementation of tril/triu ! subroutine psb_d_coo_tril(a,l,info,& & diag,imin,imax,jmin,jmax,rscale,cscale,u) diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 529fed79..5b667342 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_d_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_d_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_d_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_d_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index d99017bf..cfc4a2f9 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -3490,7 +3490,7 @@ end subroutine psb_s_coo_mv_from ! -! CSR implementation of tril/triu +! COO implementation of tril/triu ! subroutine psb_s_coo_tril(a,l,info,& & diag,imin,imax,jmin,jmax,rscale,cscale,u) diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index cb9e413d..dbe7a4be 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_s_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_s_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_s_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_s_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 5532c7f1..fb4aa48e 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -3490,7 +3490,7 @@ end subroutine psb_z_coo_mv_from ! -! CSR implementation of tril/triu +! COO implementation of tril/triu ! subroutine psb_z_coo_tril(a,l,info,& & diag,imin,imax,jmin,jmax,rscale,cscale,u) diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 4c976e14..9322105e 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_z_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_z_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_z_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_z_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) From 08c1ab0cd1189ca65c8607b880e67b96160b2de7 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 21 Aug 2023 16:32:55 +0200 Subject: [PATCH 35/38] Fix tril/triu in COO for non-OMP paths. --- base/serial/impl/psb_c_coo_impl.F90 | 59 ++++++++++++++--------------- base/serial/impl/psb_d_coo_impl.F90 | 59 ++++++++++++++--------------- base/serial/impl/psb_s_coo_impl.F90 | 59 ++++++++++++++--------------- base/serial/impl/psb_z_coo_impl.F90 | 59 ++++++++++++++--------------- 4 files changed, 116 insertions(+), 120 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index b800ce22..6cc5fd3a 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -4013,25 +4013,24 @@ subroutine psb_c_coo_triu(a,u,info,& nzuin = u%get_nzeros() ! At this point it should be 0 call l%allocate(mb,nb,nz) nzlin = l%get_nzeros() ! At this point it should be 0 - associate(val =>a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - j = ja(k) - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)>=diag_) then - nzin = nzin + 1 - u%ia(nzin) = i - u%ja(nzin) = ja(k) - u%val(nzin) = val(k) - end if + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) end if - end do - end do + end if + end do loop2 end associate call u%set_nzeros(nzin) end if diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 5452ab2b..9f9f4356 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -4013,25 +4013,24 @@ subroutine psb_d_coo_triu(a,u,info,& nzuin = u%get_nzeros() ! At this point it should be 0 call l%allocate(mb,nb,nz) nzlin = l%get_nzeros() ! At this point it should be 0 - associate(val =>a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - j = ja(k) - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)>=diag_) then - nzin = nzin + 1 - u%ia(nzin) = i - u%ja(nzin) = ja(k) - u%val(nzin) = val(k) - end if + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) end if - end do - end do + end if + end do loop2 end associate call u%set_nzeros(nzin) end if diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index cfc4a2f9..83285cdf 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -4013,25 +4013,24 @@ subroutine psb_s_coo_triu(a,u,info,& nzuin = u%get_nzeros() ! At this point it should be 0 call l%allocate(mb,nb,nz) nzlin = l%get_nzeros() ! At this point it should be 0 - associate(val =>a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - j = ja(k) - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)>=diag_) then - nzin = nzin + 1 - u%ia(nzin) = i - u%ja(nzin) = ja(k) - u%val(nzin) = val(k) - end if + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) end if - end do - end do + end if + end do loop2 end associate call u%set_nzeros(nzin) end if diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index fb4aa48e..3d9562ef 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -4013,25 +4013,24 @@ subroutine psb_z_coo_triu(a,u,info,& nzuin = u%get_nzeros() ! At this point it should be 0 call l%allocate(mb,nb,nz) nzlin = l%get_nzeros() ! At this point it should be 0 - associate(val =>a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - j = ja(k) - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, irp=>a%irp) - do i=imin_,imax_ - do k=irp(i),irp(i+1)-1 - if ((jmin_<=j).and.(j<=jmax_)) then - if ((ja(k)-i)>=diag_) then - nzin = nzin + 1 - u%ia(nzin) = i - u%ja(nzin) = ja(k) - u%val(nzin) = val(k) - end if + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) end if - end do - end do + end if + end do loop2 end associate call u%set_nzeros(nzin) end if From 3aa748b0e3780f691ee4a58dd1746cc18a0a6ab7 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 21 Aug 2023 17:22:16 +0200 Subject: [PATCH 36/38] Finish dual OMP/notOMP g2lv1_ins --- base/modules/desc/psb_hash_map_mod.F90 | 721 ++++++++++++------------- 1 file changed, 355 insertions(+), 366 deletions(-) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index b7f53879..058dbb8d 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -652,7 +652,7 @@ contains integer(psb_ipk_) :: me, np character(len=20) :: name,ch_err logical, allocatable :: mask_(:) - logical :: use_openmp = .true. +!!$ logical :: use_openmp = .true. #ifdef OPENMP integer(kind = OMP_lock_kind) :: ins_lck #endif @@ -683,119 +683,32 @@ contains mglob = idxmap%get_gr() nrow = idxmap%get_lr() !write(0,*) me,name,' before loop ',psb_errstatus_fatal() - if (use_openmp) then #ifdef OPENMP - !call OMP_init_lock(ins_lck) + !call OMP_init_lock(ins_lck) - if (idxmap%is_bld()) then + if (idxmap%is_bld()) then - isLoopValid = .true. - ncol = idxmap%get_lc() - if (present(mask)) then - mask_ = mask - else - allocate(mask_(size(idx))) - mask_ = .true. - end if - - if (present(lidx)) then - if (present(mask)) then - !$omp critical(hash_g2l_ins) - - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - !call OMP_set_lock(ins_lck) - ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - !call OMP_set_lock(ins_lck) - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - ncol = idxmap%get_lc() - nxt = lidx(i) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - - if (info /= psb_success_) then - !write(0,*) 'Error spot 1' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - end if - end if - else - idx(i) = -1 - end if - !call OMP_unset_lock(ins_lck) - end if - else - idx(i) = lip - end if - else - idx(i) = -1 - end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) + isLoopValid = .true. + ncol = idxmap%get_lc() + if (present(mask)) then + mask_ = mask + else + allocate(mask_(size(idx))) + mask_ = .true. + end if - if (.not. isLoopValid) then - goto 9999 - end if - else - !$omp critical(hash_g2l_ins) + if (present(lidx)) then + if (present(mask)) then + !$omp critical(hash_g2l_ins) - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 @@ -812,6 +725,7 @@ contains & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). @@ -823,9 +737,11 @@ contains if (lip > 0) then idx(i) = lip else if (lip < 0) then + ! Index not found call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip + if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside ! the hash map. In that case 'tlip' is the value corresponding @@ -838,7 +754,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then - !write(0,*) 'Error spot 2' + !write(0,*) 'Error spot 1' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -858,160 +774,160 @@ contains else idx(i) = lip end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if + + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 end if - else if (.not.present(lidx)) then - if(present(mask)) then - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - - !$omp critical(hash_g2l_ins) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - !call OMP_set_lock(ins_lck) - ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) + else + !$omp critical(hash_g2l_ins) - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() - if (lip < 0) then - !call OMP_set_lock(ins_lck) - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing, so this lock is relatively rare). - ncol = idxmap%get_lc() - nxt = ncol + 1 - !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() - if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() - !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() - lip = tlip - - if (info >= 0) then - !write(0,*) 'Error before spot 3', info - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() - if (info /= psb_success_) then - !write(0,*) 'Error spot 3', info - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - end if - end if - else + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + ncol = idxmap%get_lc() + nxt = lidx(i) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + + ncol = MAX(ncol,nxt) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + !write(0,*) 'Error spot 2' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - !call OMP_unset_lock(ins_lck) end if else - idx(i) = lip + idx(i) = -1 end if - else - idx(i) = -1 + !call OMP_unset_lock(ins_lck) end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = lip end if - else - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - !$omp critical(hash_g2l_ins) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle + + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + end if + else if (.not.present(lidx)) then + if(present(mask)) then + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + + !$omp critical(hash_g2l_ins) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - !call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() !call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. + !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) + !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() if (lip < 0) then !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index - ! is often already existing). + ! is often already existing, so this lock is relatively rare). ncol = idxmap%get_lc() - nxt = ncol + 1 + nxt = ncol + 1 + !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - + !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() if (lip > 0) then idx(i) = lip else if (lip < 0) then ! Index not found + !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() + !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() lip = tlip if (info >= 0) then + !write(0,*) 'Error before spot 3', info ! 'nxt' is not equal to 'tlip' when the key is already inside ! the hash map. In that case 'tlip' is the value corresponding ! to the existing mapping. if (nxt == tlip) then ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - + if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() if (info /= psb_success_) then - !write(0,*) 'Error spot 4' + !write(0,*) 'Error spot 3', info call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -1028,116 +944,145 @@ contains end if !call OMP_unset_lock(ins_lck) end if - else idx(i) = lip end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if - end if - end if - else - ! Wrong state - idx = -1 - info = -1 - end if - !call OMP_destroy_lock(ins_lck) -#endif - else if (.not.use_openmp) then -#ifdef OPENMP - ! $ omp parallel - ! $ omp critical - !write(0,*) 'In cnv: ',omp_get_num_threads() -#endif - isLoopValid = .true. - if (idxmap%is_bld()) then + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) - if (present(lidx)) then - if (present(mask)) then - do i = 1, is + if (.not. isLoopValid) then + goto 9999 + end if + else + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + !$omp critical(hash_g2l_ins) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob) ) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (info >=0) then - if (nxt == tlip) then - ncol = max(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - !write(0,*) 'Error spot' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. - end if - idxmap%loc_to_glob(nxt) = ip + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + + ncol = MAX(ncol,nxt) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + !write(0,*) 'Error spot 4' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - isLoopValid = .false. + end if end if + else + idx(i) = -1 end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 + !call OMP_unset_lock(ins_lck) end if - enddo - else if (.not.present(mask)) then + else + idx(i) = lip + end if - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + + end if + end if + else + ! Wrong state + idx = -1 + info = -1 + end if + !call OMP_destroy_lock(ins_lck) +#else +!!$ else if (.not.use_openmp) then + isLoopValid = .true. + if (idxmap%is_bld()) then + + if (present(lidx)) then + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then idx(i) = -1 cycle endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + tlip = lip nxt = lidx(i) if (nxt <= nrow) then idx(i) = -1 cycle endif call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - if (info >=0) then - if (nxt == lip) then - ncol = max(nxt,ncol) + if (nxt == tlip) then + ncol = max(ncol,nxt) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then - info=1 - !write(0,*) 'Error spot' + !write(0,*) 'Error spot' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1151,66 +1096,71 @@ contains end if idx(i) = lip info = psb_success_ - enddo + else + idx(i) = -1 + end if + enddo - end if + else if (.not.present(mask)) then - else if (.not.present(lidx)) then + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip - if (present(mask)) then - do i = 1, is - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle + if (info >=0) then + if (nxt == lip) then + ncol = max(nxt,ncol) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + !write(0,*) 'Error spot' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) endif - ncol = idxmap%get_lc() - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if - - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - write(0,*) 'Error spot 5' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - isLoopValid = .false. - end if - idx(i) = lip info = psb_success_ else - idx(i) = -1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if - enddo - else if (.not.present(mask)) then + end if + idx(i) = lip + info = psb_success_ + enddo - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) + end if + + else if (.not.present(lidx)) then + + if (present(mask)) then + do i = 1, is + if (mask(i)) then + ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - nxt = ncol + 1 + ncol = idxmap%get_lc() + nxt = ncol + 1 call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& & idxmap%glb_lc,ncol) if (lip < 0) then @@ -1225,41 +1175,80 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 - write(0,*) 'Error spot 6' - ch_err='psb_ensure_size' + write(0,*) 'Error spot 5' call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + & a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) endif info = psb_success_ else - ch_err='SearchInsKeyVal' call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - isLoopValid = .false. + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if idx(i) = lip info = psb_success_ - enddo + else + idx(i) = -1 + end if + enddo + else if (.not.present(mask)) then + + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + write(0,*) 'Error spot 6' + ch_err='psb_ensure_size' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + isLoopValid = .false. + + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + ch_err='SearchInsKeyVal' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + isLoopValid = .false. + end if + idx(i) = lip + info = psb_success_ + enddo - end if end if - else - ! Wrong state - idx = -1 - info = -1 end if -#ifdef OPENMP - ! $ omp end critical - ! $ omp end parallel - -#endif - if (.not. isLoopValid) goto 9999 + else + ! Wrong state + idx = -1 + info = -1 end if + if (.not. isLoopValid) goto 9999 +#endif !write(0,*) me,name,' after loop ',psb_errstatus_fatal() call psb_erractionrestore(err_act) return From 26bf4c5d699aec0a94f59e4b9231ffa170bce3fe Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 21 Aug 2023 17:26:37 +0200 Subject: [PATCH 37/38] Fixed COO csput for OMP/not OMP --- base/serial/impl/psb_c_coo_impl.F90 | 14 +++++--------- base/serial/impl/psb_d_coo_impl.F90 | 14 +++++--------- base/serial/impl/psb_s_coo_impl.F90 | 14 +++++--------- base/serial/impl/psb_z_coo_impl.F90 | 14 +++++--------- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 6cc5fd3a..3e75820f 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -2875,18 +2875,14 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - else - nzaold = nza - nza = nza + nz - call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) + + !write(0,*) 'From CSPUT :',nza,nzaold + call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2961,7 +2957,7 @@ contains end do !$OMP END PARALLEL DO - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 9f9f4356..86d93ed6 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -2875,18 +2875,14 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - else - nzaold = nza - nza = nza + nz - call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) + + !write(0,*) 'From CSPUT :',nza,nzaold + call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2961,7 +2957,7 @@ contains end do !$OMP END PARALLEL DO - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 83285cdf..8791f897 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -2875,18 +2875,14 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - else - nzaold = nza - nza = nza + nz - call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) + + !write(0,*) 'From CSPUT :',nza,nzaold + call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2961,7 +2957,7 @@ contains end do !$OMP END PARALLEL DO - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 3d9562ef..952b9751 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -2875,18 +2875,14 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - else - nzaold = nza - nza = nza + nz - call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) + + !write(0,*) 'From CSPUT :',nza,nzaold + call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2961,7 +2957,7 @@ contains end do !$OMP END PARALLEL DO - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) From f001ebbad39ac46f9258bf674704bd2b2858fefa Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 22 Aug 2023 10:22:46 +0200 Subject: [PATCH 38/38] Final fix for COO on OMP --- base/serial/impl/psb_c_coo_impl.F90 | 14 ++++++++------ base/serial/impl/psb_d_coo_impl.F90 | 14 ++++++++------ base/serial/impl/psb_s_coo_impl.F90 | 14 ++++++++------ base/serial/impl/psb_z_coo_impl.F90 | 14 ++++++++------ 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 3e75820f..46391dee 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2875,14 +2877,15 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - - !write(0,*) 'From CSPUT :',nza,nzaold - call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2956,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 86d93ed6..c2babf8e 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2875,14 +2877,15 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - - !write(0,*) 'From CSPUT :',nza,nzaold - call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2956,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 8791f897..402c608a 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2875,14 +2877,15 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - - !write(0,*) 'From CSPUT :',nza,nzaold - call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2956,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 952b9751..542f842e 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2875,14 +2877,15 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - - !write(0,*) 'From CSPUT :',nza,nzaold - call a%set_nzeros(nza) call a%set_sorted(.false.) else if (a%is_upd()) then @@ -2956,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz