diff --git a/base/internals/psi_bld_tmphalo.f90 b/base/internals/psi_bld_tmphalo.f90 index 78ce69ed..9d3fb7ab 100644 --- a/base/internals/psi_bld_tmphalo.f90 +++ b/base/internals/psi_bld_tmphalo.f90 @@ -104,6 +104,7 @@ subroutine psi_bld_tmphalo(desc,info) call desc%indxmap%l2gip(helem(1:nh),info) if (info == psb_success_) call desc%indxmap%fnd_owner(helem(1:nh),hproc,info) + !write(0,*) 'bld_tmphalo calling set_owner',hproc(:) if (info == psb_success_) call desc%indxmap%set_halo_owner(hproc,info) if (info == psb_success_) call desc%indxmap%xtnd_p_adjcncy(hproc) diff --git a/base/modules/auxil/psb_c_qsort_mod.f90 b/base/modules/auxil/psb_c_qsort_mod.f90 index 8b365222..6c4ceb3f 100644 --- a/base/modules/auxil/psb_c_qsort_mod.f90 +++ b/base/modules/auxil/psb_c_qsort_mod.f90 @@ -44,7 +44,6 @@ module psb_c_qsort_mod use psb_const_mod - interface psb_qsort subroutine psb_cqsort(x,ix,dir,flag) import diff --git a/base/modules/auxil/psb_c_realloc_mod.F90 b/base/modules/auxil/psb_c_realloc_mod.F90 index 231fde3f..9b22bee7 100644 --- a/base/modules/auxil/psb_c_realloc_mod.F90 +++ b/base/modules/auxil/psb_c_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_d_qsort_mod.f90 b/base/modules/auxil/psb_d_qsort_mod.f90 index 4e1be1d1..4da0b840 100644 --- a/base/modules/auxil/psb_d_qsort_mod.f90 +++ b/base/modules/auxil/psb_d_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_d_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_dbsrch(key,n,v) result(ipos) + function psb_dbsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n real(psb_dpk_) :: key real(psb_dpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_dbsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_d_realloc_mod.F90 b/base/modules/auxil/psb_d_realloc_mod.F90 index cb48fc66..ca85e0ec 100644 --- a/base/modules/auxil/psb_d_realloc_mod.F90 +++ b/base/modules/auxil/psb_d_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_e_qsort_mod.f90 b/base/modules/auxil/psb_e_qsort_mod.f90 index 17943bbf..09f45d45 100644 --- a/base/modules/auxil/psb_e_qsort_mod.f90 +++ b/base/modules/auxil/psb_e_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_e_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_ebsrch(key,n,v) result(ipos) + function psb_ebsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n integer(psb_epk_) :: key integer(psb_epk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_ebsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 1bf45e30..06a6d034 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_i2_qsort_mod.f90 b/base/modules/auxil/psb_i2_qsort_mod.f90 index 944a436e..2f192a0a 100644 --- a/base/modules/auxil/psb_i2_qsort_mod.f90 +++ b/base/modules/auxil/psb_i2_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_i2_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_i2bsrch(key,n,v) result(ipos) + function psb_i2bsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n integer(psb_i2pk_) :: key integer(psb_i2pk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_i2bsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_i2_realloc_mod.F90 b/base/modules/auxil/psb_i2_realloc_mod.F90 index cbf4d58a..146bdf7e 100644 --- a/base/modules/auxil/psb_i2_realloc_mod.F90 +++ b/base/modules/auxil/psb_i2_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_m_qsort_mod.f90 b/base/modules/auxil/psb_m_qsort_mod.f90 index cb4c81c1..bf029065 100644 --- a/base/modules/auxil/psb_m_qsort_mod.f90 +++ b/base/modules/auxil/psb_m_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_m_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_mbsrch(key,n,v) result(ipos) + function psb_mbsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n integer(psb_mpk_) :: key integer(psb_mpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_mbsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_m_realloc_mod.F90 b/base/modules/auxil/psb_m_realloc_mod.F90 index e0e486ec..4d2f9316 100644 --- a/base/modules/auxil/psb_m_realloc_mod.F90 +++ b/base/modules/auxil/psb_m_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_s_qsort_mod.f90 b/base/modules/auxil/psb_s_qsort_mod.f90 index d4851fd1..a5bdb2d9 100644 --- a/base/modules/auxil/psb_s_qsort_mod.f90 +++ b/base/modules/auxil/psb_s_qsort_mod.f90 @@ -43,14 +43,13 @@ module psb_s_qsort_mod use psb_const_mod - - interface psb_bsrch - function psb_sbsrch(key,n,v) result(ipos) + function psb_sbsrch(key,n,v,dir,find) result(ipos) import integer(psb_ipk_) :: ipos, n real(psb_spk_) :: key real(psb_spk_) :: v(:) + integer(psb_ipk_), optional :: dir, find end function psb_sbsrch end interface psb_bsrch diff --git a/base/modules/auxil/psb_s_realloc_mod.F90 b/base/modules/auxil/psb_s_realloc_mod.F90 index 676caf31..f064e606 100644 --- a/base/modules/auxil/psb_s_realloc_mod.F90 +++ b/base/modules/auxil/psb_s_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psb_z_qsort_mod.f90 b/base/modules/auxil/psb_z_qsort_mod.f90 index 14ee0c57..2fc6baab 100644 --- a/base/modules/auxil/psb_z_qsort_mod.f90 +++ b/base/modules/auxil/psb_z_qsort_mod.f90 @@ -44,7 +44,6 @@ module psb_z_qsort_mod use psb_const_mod - interface psb_qsort subroutine psb_zqsort(x,ix,dir,flag) import diff --git a/base/modules/auxil/psb_z_realloc_mod.F90 b/base/modules/auxil/psb_z_realloc_mod.F90 index 652d3c99..e9eb26d3 100644 --- a/base/modules/auxil/psb_z_realloc_mod.F90 +++ b/base/modules/auxil/psb_z_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,24 +997,6 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If isz = psb_size(v) If (len > isz) Then #if defined(OPENMP) diff --git a/base/modules/auxil/psi_c_serial_mod.f90 b/base/modules/auxil/psi_c_serial_mod.f90 index 191e7ef3..0fdff04b 100644 --- a/base/modules/auxil/psi_c_serial_mod.f90 +++ b/base/modules/auxil/psi_c_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_c_serial_mod end subroutine psi_csctv end interface psi_sct + interface psi_exscan + subroutine psi_c_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_spk_ + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), intent(in), optional :: shift + end subroutine psi_c_exscanv + end interface psi_exscan + end module psi_c_serial_mod diff --git a/base/modules/auxil/psi_d_serial_mod.f90 b/base/modules/auxil/psi_d_serial_mod.f90 index f1dbc16c..0ce14dbb 100644 --- a/base/modules/auxil/psi_d_serial_mod.f90 +++ b/base/modules/auxil/psi_d_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_d_serial_mod end subroutine psi_dsctv end interface psi_sct + interface psi_exscan + subroutine psi_d_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_dpk_ + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_dpk_), intent(in), optional :: shift + end subroutine psi_d_exscanv + end interface psi_exscan + end module psi_d_serial_mod diff --git a/base/modules/auxil/psi_e_serial_mod.f90 b/base/modules/auxil/psi_e_serial_mod.f90 index 909b025c..f0372e01 100644 --- a/base/modules/auxil/psi_e_serial_mod.f90 +++ b/base/modules/auxil/psi_e_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_e_serial_mod end subroutine psi_esctv end interface psi_sct + interface psi_exscan + subroutine psi_e_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_epk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_epk_), intent(in), optional :: shift + end subroutine psi_e_exscanv + end interface psi_exscan + end module psi_e_serial_mod diff --git a/base/modules/auxil/psi_i2_serial_mod.f90 b/base/modules/auxil/psi_i2_serial_mod.f90 index 38ef9c38..70dd95e1 100644 --- a/base/modules/auxil/psi_i2_serial_mod.f90 +++ b/base/modules/auxil/psi_i2_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_i2_serial_mod end subroutine psi_i2sctv end interface psi_sct + interface psi_exscan + subroutine psi_i2_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_i2pk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_i2pk_), intent(in), optional :: shift + end subroutine psi_i2_exscanv + end interface psi_exscan + end module psi_i2_serial_mod diff --git a/base/modules/auxil/psi_m_serial_mod.f90 b/base/modules/auxil/psi_m_serial_mod.f90 index a80d0ffe..cfd1348e 100644 --- a/base/modules/auxil/psi_m_serial_mod.f90 +++ b/base/modules/auxil/psi_m_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_m_serial_mod end subroutine psi_msctv end interface psi_sct + interface psi_exscan + subroutine psi_m_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_lpk_,psb_mpk_, psb_epk_ + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_mpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_), intent(in), optional :: shift + end subroutine psi_m_exscanv + end interface psi_exscan + end module psi_m_serial_mod diff --git a/base/modules/auxil/psi_s_serial_mod.f90 b/base/modules/auxil/psi_s_serial_mod.f90 index 3c0a3bdc..25c4a7ef 100644 --- a/base/modules/auxil/psi_s_serial_mod.f90 +++ b/base/modules/auxil/psi_s_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_s_serial_mod end subroutine psi_ssctv end interface psi_sct + interface psi_exscan + subroutine psi_s_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_spk_ + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), intent(in), optional :: shift + end subroutine psi_s_exscanv + end interface psi_exscan + end module psi_s_serial_mod diff --git a/base/modules/auxil/psi_z_serial_mod.f90 b/base/modules/auxil/psi_z_serial_mod.f90 index 7bc9728e..b40cf05a 100644 --- a/base/modules/auxil/psi_z_serial_mod.f90 +++ b/base/modules/auxil/psi_z_serial_mod.f90 @@ -156,4 +156,15 @@ module psi_z_serial_mod end subroutine psi_zsctv end interface psi_sct + interface psi_exscan + subroutine psi_z_exscanv(n,x,info,shift) + import :: psb_ipk_, psb_dpk_ + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), intent(in), optional :: shift + end subroutine psi_z_exscanv + end interface psi_exscan + end module psi_z_serial_mod diff --git a/base/modules/desc/psb_gen_block_map_mod.F90 b/base/modules/desc/psb_gen_block_map_mod.F90 index dfb47b61..f0c433e0 100644 --- a/base/modules/desc/psb_gen_block_map_mod.F90 +++ b/base/modules/desc/psb_gen_block_map_mod.F90 @@ -488,6 +488,7 @@ contains integer(psb_ipk_) :: iam, np logical :: owned_ + !write(0,*) 'block_g2lv2' info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) diff --git a/base/modules/desc/psb_glist_map_mod.f90 b/base/modules/desc/psb_glist_map_mod.F90 similarity index 100% rename from base/modules/desc/psb_glist_map_mod.f90 rename to base/modules/desc/psb_glist_map_mod.F90 diff --git a/base/modules/desc/psb_hash_map_mod.f90 b/base/modules/desc/psb_hash_map_mod.F90 similarity index 78% rename from base/modules/desc/psb_hash_map_mod.f90 rename to base/modules/desc/psb_hash_map_mod.F90 index 06e68451..058dbb8d 100644 --- a/base/modules/desc/psb_hash_map_mod.f90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -207,7 +207,6 @@ contains integer(psb_ipk_) :: i logical :: owned_ info = 0 - if (present(mask)) then if (size(mask) < size(idx)) then info = -1 @@ -249,7 +248,6 @@ contains end do end if - end subroutine hash_l2gv1 subroutine hash_l2gv2(idxin,idxout,idxmap,info,mask,owned) @@ -334,7 +332,6 @@ contains info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) - if (present(mask)) then if (size(mask) < size(idx)) then info = -1 @@ -378,6 +375,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -417,6 +415,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -437,7 +436,6 @@ contains end if end if - end subroutine hash_g2lv1 subroutine hash_g2lv2(idxin,idxout,idxmap,info,mask,owned) @@ -460,7 +458,6 @@ contains is = size(idxin) im = min(is,size(idxout)) - info = 0 ctxt = idxmap%get_ctxt() call psb_info(ctxt,iam,np) @@ -508,6 +505,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -520,7 +518,6 @@ contains endif end if enddo - else write(0,*) 'Hash status: invalid ',idxmap%get_state() idxout(1:is) = -1 @@ -547,6 +544,7 @@ contains if (lip < 0) then call psb_hash_searchkey(ip,tlip,idxmap%hash,info) lip = tlip + info = 0 end if if (owned_) then if (lip<=nrow) then @@ -567,7 +565,6 @@ contains end if end if - end subroutine hash_g2lv2 @@ -636,7 +633,9 @@ contains use psb_realloc_mod use psb_sort_mod use psb_penv_mod - !$ use omp_lib +#ifdef OPENMP + use omp_lib +#endif implicit none @@ -652,15 +651,14 @@ contains type(psb_ctxt_type) :: ctxt integer(psb_ipk_) :: me, np character(len=20) :: name,ch_err - logical :: use_openmp = .false. - + logical, allocatable :: mask_(:) +!!$ logical :: use_openmp = .true. +#ifdef OPENMP + integer(kind = OMP_lock_kind) :: ins_lck +#endif logical, volatile :: isLoopValid - !$ integer(kind = OMP_lock_kind) :: ins_lck - - !$ use_openmp = .true. - info = psb_success_ - name = 'hash_g2l_ins' + name = 'hash_g2lv1_ins' call psb_erractionsave(err_act) ctxt = idxmap%get_ctxt() @@ -684,340 +682,321 @@ contains mglob = idxmap%get_gr() nrow = idxmap%get_lr() - - if (idxmap%is_bld()) then - if (use_openmp) then - !$ call OMP_init_lock(ins_lck) - isLoopValid = .true. - ncol = idxmap%get_lc() + !write(0,*) me,name,' before loop ',psb_errstatus_fatal() +#ifdef OPENMP + !call OMP_init_lock(ins_lck) + + if (idxmap%is_bld()) then + + isLoopValid = .true. + ncol = idxmap%get_lc() + if (present(mask)) then + mask_ = mask + else + allocate(mask_(size(idx))) + mask_ = .true. end if if (present(lidx)) then if (present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,mask,lidx,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is - - if (mask(i)) then - ip = idx(i) + !$omp critical(hash_g2l_ins) + + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + ncol = idxmap%get_lc() + nxt = lidx(i) call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - !$ call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap read/write. - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) + ncol = MAX(ncol,nxt) - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) - isLoopValid = .false. - cycle - end if + if (info /= psb_success_) then + !write(0,*) 'Error spot 1' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) - idxmap%loc_to_glob(nxt) = ip + isLoopValid = .false. + idx(i) = -1 else - !$ call OMP_unset_lock(ins_lck) + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - - info = psb_success_ - - else - !$ call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle end if else - !$ call OMP_unset_lock(ins_lck) + idx(i) = -1 end if + !call OMP_unset_lock(ins_lck) end if - - idx(i) = lip - info = psb_success_ else - idx(i) = -1 + idx(i) = lip end if - end do - !$OMP END PARALLEL DO - - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if - else - do i = 1, is + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + else + !$omp critical(hash_g2l_ins) + + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob) ) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (info >=0) then - if (nxt == tlip) then - ncol = max(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip + nxt = lidx(i) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + + ncol = MAX(ncol,nxt) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + !write(0,*) 'Error spot 2' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 + end if end if + else + idx(i) = -1 end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 + !call OMP_unset_lock(ins_lck) end if - enddo + else + idx(i) = lip + end if + + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 end if - else if (.not.present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,lidx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is + end if + else if (.not.present(lidx)) then + if(present(mask)) then + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + + !$omp critical(hash_g2l_ins) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + if (mask(i)) then ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) - ! In OMP logic the index research limit is turned off. It is - ! a correct way to fit the subroutine? + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - + !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() if (lip < 0) then - tlip = lip - nxt = lidx(i) - - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - + !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index - ! is often already existing). - !$ call OMP_set_lock(ins_lck) + ! is often already existing, so this lock is relatively rare). + ncol = idxmap%get_lc() + nxt = ncol + 1 + !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip < 0) then - ! Locking system to handle concurrent write/access. Under checking! + & idxmap%hashv,idxmap%glb_lc,ncol) + !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found + !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - !$ call OMP_unset_lock(ins_lck) + if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() + !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() + lip = tlip if (info >= 0) then + !write(0,*) 'Error before spot 3', info ! 'nxt' is not equal to 'tlip' when the key is already inside ! the hash map. In that case 'tlip' is the value corresponding ! to the existing mapping. if (nxt == tlip) then ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) - - ! Under checking! call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - + if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() if (info /= psb_success_) then + !write(0,*) 'Error spot 3', info call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - cycle + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - - idxmap%loc_to_glob(nxt) = ip - else - !$ call OMP_unset_lock(ins_lck) end if - - info = psb_success_ else - !$ call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle + idx(i) = -1 end if - else - !$ call OMP_unset_lock(ins_lck) + !call OMP_unset_lock(ins_lck) end if + else + idx(i) = lip end if - - idx(i) = lip - info = psb_success_ - end do - !$OMP END PARALLEL DO - - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if - else - do i = 1, is + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + else + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + !$omp critical(hash_g2l_ins) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip - - if (info >=0) then - if (nxt == lip) then - ncol = max(nxt,ncol) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 - end if - end if - idx(i) = lip - info = psb_success_ - enddo - end if - end if - - else if (.not.present(lidx)) then - - if (present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mask,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is - - if (mask(i)) then - ip = idx(i) - - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip < 0) then - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - !$ call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap write/access. - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - else - !$ call OMP_unset_lock(ins_lck) - end if - - idx(i) = lip - info = psb_success_ - end if if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside @@ -1026,74 +1005,84 @@ contains if (nxt == tlip) then ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then + !write(0,*) 'Error spot 4' call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) + &a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - cycle + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - - idxmap%loc_to_glob(nxt) = ip - else - !$ call OMP_unset_lock(ins_lck) end if - - info = psb_success_ else - !$ call OMP_unset_lock(ins_lck) - - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - - isLoopValid = .false. - cycle + idx(i) = -1 end if - - else - idx(i) = -1 + !call OMP_unset_lock(ins_lck) end if - end do - !$OMP END PARALLEL DO - call idxmap%set_lc(ncol) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = lip end if - else - do i = 1, is - ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + + end if + end if + else + ! Wrong state + idx = -1 + info = -1 + end if + !call OMP_destroy_lock(ins_lck) +#else +!!$ else if (.not.use_openmp) then + isLoopValid = .true. + if (idxmap%is_bld()) then + + if (present(lidx)) then + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + tlip = lip + nxt = lidx(i) + if (nxt <= nrow) then idx(i) = -1 cycle endif - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if - + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) if (info >=0) then - if (nxt == lip) then - ncol = nxt + if (nxt == tlip) then + ncol = max(ncol,nxt) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then - info=1 + !write(0,*) 'Error spot' call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='psb_ensure_size',i_err=(/info/)) - goto 9999 + &a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1102,111 +1091,76 @@ contains else call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) - goto 9999 + isLoopValid = .false. end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 end if - enddo - end if - else if (.not.present(mask)) then - if (use_openmp) then - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(name,is,idx,mglob,idxmap,ncol,nrow,ins_lck,laddsz) & - !$OMP private(i,ip,lip,tlip,nxt,info) & - !$OMP reduction(.AND.:isLoopValid) - do i = 1, is + idx(i) = lip + info = psb_success_ + else + idx(i) = -1 + end if + enddo - ip = idx(i) + else if (.not.present(mask)) then - if ((ip < 1 ).or.(ip>mglob)) then + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + nxt = lidx(i) + if (nxt <= nrow) then idx(i) = -1 cycle endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip < 0) then - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - !$ call OMP_set_lock(ins_lck) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - ! Index not found - if (lip < 0) then - ! Locking system to handle concurrent hashmap write/access. - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - else - !$ call OMP_unset_lock(ins_lck) - end if - - idx(i) = lip - info = psb_success_ - end if - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - ncol = MAX(ncol,nxt) - !$ call OMP_unset_lock(ins_lck) - + if (info >=0) then + if (nxt == lip) then + ncol = max(nxt,ncol) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then + info=1 + !write(0,*) 'Error spot' call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - !$ isLoopValid = .false. - cycle + &a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. end if - - idxmap%loc_to_glob(nxt) = ip - else - !$ call OMP_unset_lock(ins_lck) - end if - + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif info = psb_success_ - else - !$ call OMP_unset_lock(ins_lck) - call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='SearchInsKeyVal',i_err=(/info/)) - isLoopValid = .false. - cycle end if - end do - !$OMP END PARALLEL DO + end if + idx(i) = lip + info = psb_success_ + enddo - call idxmap%set_lc(ncol) + end if - if (.not. isLoopValid) then - goto 9999 - end if + else if (.not.present(lidx)) then - else - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) + if (present(mask)) then + do i = 1, is + if (mask(i)) then + ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - nxt = ncol + 1 + ncol = idxmap%get_lc() + nxt = ncol + 1 call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& & idxmap%glb_lc,ncol) if (lip < 0) then @@ -1221,38 +1175,81 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 - ch_err='psb_ensure_size' + write(0,*) 'Error spot 5' call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 + & a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) endif info = psb_success_ else - ch_err='SearchInsKeyVal' call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - goto 9999 + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if idx(i) = lip info = psb_success_ - enddo - end if - end if - end if + else + idx(i) = -1 + end if + enddo + else if (.not.present(mask)) then - if (use_openmp) then - !$ call OMP_destroy_lock(ins_lck) - end if + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + write(0,*) 'Error spot 6' + ch_err='psb_ensure_size' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + isLoopValid = .false. + + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + ch_err='SearchInsKeyVal' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + isLoopValid = .false. + end if + idx(i) = lip + info = psb_success_ + enddo + end if + end if else ! Wrong state idx = -1 info = -1 end if - + if (.not. isLoopValid) goto 9999 +#endif + !write(0,*) me,name,' after loop ',psb_errstatus_fatal() call psb_erractionrestore(err_act) return @@ -1266,6 +1263,7 @@ contains subroutine hash_g2lv2_ins(idxin,idxout,idxmap,info,mask,lidx) use psb_realloc_mod + use psb_error_mod implicit none class(psb_hash_map), intent(inout) :: idxmap integer(psb_lpk_), intent(in) :: idxin(:) @@ -1278,7 +1276,9 @@ contains is = size(idxin) im = min(is,size(idxout)) + !write(0,*) 'g2lv2_ins before realloc ',psb_errstatus_fatal() call psb_realloc(im,tidx,info) + !write(0,*) 'g2lv2_ins after realloc ',psb_errstatus_fatal() tidx(1:im) = idxin(1:im) call idxmap%g2lip_ins(tidx(1:im),info,mask=mask,lidx=lidx) idxout(1:im) = tidx(1:im) diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index 42911e3f..9e45e1f0 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -388,7 +388,7 @@ contains info = HashOK hsize = hash%hsize hmask = hash%hmask - + val = -1 hk = iand(psb_hashval(key),hmask) if (hk == 0) then hd = 1 @@ -407,35 +407,49 @@ contains if (hash%table(hk,1) == key) then val = hash%table(hk,2) info = HashDuplicate + !write(0,*) 'In searchinskey 1 : ', info, HashDuplicate return end if - if (hash%table(hk,1) == HashFreeEntry) then - if (hash%nk == hash%hsize -1) then - ! - ! Note: because of the way we allocate things at CDALL - ! time this is really unlikely; if we get here, we - ! have at least as many halo indices as internals, which - ! means we're already in trouble. But we try to keep going. - ! - call psb_hash_realloc(hash,info) - if (info /= HashOk) then - info = HashOutOfMemory - return + !$omp critical(hashsearchins) + if (hash%table(hk,1) == key) then + val = hash%table(hk,2) + info = HashDuplicate + else + if (hash%table(hk,1) == HashFreeEntry) then + if (hash%nk == hash%hsize -1) then + ! + ! Note: because of the way we allocate things at CDALL + ! time this is really unlikely; if we get here, we + ! have at least as many halo indices as internals, which + ! means we're already in trouble. But we try to keep going. + ! + call psb_hash_realloc(hash,info) + if (info /= HashOk) then + info = HashOutOfMemory + !return + else + call psb_hash_searchinskey(key,val,nextval,hash,info) + !return + end if else - call psb_hash_searchinskey(key,val,nextval,hash,info) - return + hash%nk = hash%nk + 1 + hash%table(hk,1) = key + hash%table(hk,2) = nextval + val = nextval + !return end if - else - hash%nk = hash%nk + 1 - hash%table(hk,1) = key - hash%table(hk,2) = nextval - val = nextval - return end if end if + !$omp end critical(hashsearchins) + if (info /= HashOk) then + write(0,*) 'In searchinskey 2: ', info + return + end if + if (val > 0) return hk = hk - hd if (hk < 0) hk = hk + hsize end do + !write(0,*) 'In searchinskey 3: ', info end subroutine psb_hash_lsearchinskey recursive subroutine psb_hash_isearchinskey(key,val,nextval,hash,info) @@ -448,7 +462,7 @@ contains info = HashOK hsize = hash%hsize hmask = hash%hmask - + hk = iand(psb_hashval(key),hmask) if (hk == 0) then hd = 1 @@ -460,7 +474,7 @@ contains info = HashOutOfMemory return end if - + val = -1 hash%nsrch = hash%nsrch + 1 do hash%nacc = hash%nacc + 1 @@ -469,6 +483,7 @@ contains info = HashDuplicate return end if + !$OMP CRITICAL if (hash%table(hk,1) == HashFreeEntry) then if (hash%nk == hash%hsize -1) then ! @@ -480,19 +495,22 @@ contains call psb_hash_realloc(hash,info) if (info /= HashOk) then info = HashOutOfMemory - return + !return else call psb_hash_searchinskey(key,val,nextval,hash,info) - return + !return end if else hash%nk = hash%nk + 1 hash%table(hk,1) = key hash%table(hk,2) = nextval val = nextval - return + !return end if end if + !$OMP END CRITICAL + if (info /= HashOk) return + if (val > 0) return hk = hk - hd if (hk < 0) hk = hk + hsize end do diff --git a/base/modules/desc/psb_list_map_mod.f90 b/base/modules/desc/psb_list_map_mod.F90 similarity index 80% rename from base/modules/desc/psb_list_map_mod.f90 rename to base/modules/desc/psb_list_map_mod.F90 index 3e3c8e25..6b61cf52 100644 --- a/base/modules/desc/psb_list_map_mod.f90 +++ b/base/modules/desc/psb_list_map_mod.F90 @@ -349,7 +349,6 @@ contains logical :: owned_ info = 0 - if (present(mask)) then if (size(mask) < size(idxin)) then info = -1 @@ -501,19 +500,37 @@ contains if (present(lidx)) then if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if ((ix <= idxmap%local_rows).or.(info /= 0)) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if ((ix <= idxmap%local_rows).or.(info /= 0)) then + if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -525,18 +542,37 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if ((ix <= idxmap%local_rows).or.(info /= 0)) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if ((ix <= idxmap%local_rows).or.(info /= 0)) then + if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -549,19 +585,37 @@ contains if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) - if (info /= 0) then + if (info /= 0) then info = -4 - return + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -573,18 +627,36 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idx(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then info = -4 - return + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idx(i) + idxmap%glob_to_loc(idx(i)) = ix end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idx(i) - idxmap%glob_to_loc(idx(i)) = ix +#endif end if idx(i) = ix else @@ -641,19 +713,37 @@ contains if (present(lidx)) then if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) - if (ix < 0) then + if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else @@ -665,18 +755,36 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = lidx(i) + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = lidx(i) call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if ((ix <= idxmap%local_rows).or.(info /= 0)) then info = -4 - return + else + idxmap%local_cols = max(ix,idxmap%local_cols) + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix end if - idxmap%local_cols = max(ix,idxmap%local_cols) - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else @@ -689,19 +797,37 @@ contains if (present(mask)) then do i=1, is + if (info /= 0) cycle if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) - if (ix < 0) then + if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then info = -4 - return + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix +#endif end if idxout(i) = ix else @@ -713,18 +839,36 @@ contains else if (.not.present(mask)) then do i=1, is + if (info /= 0) cycle if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) if (ix < 0) then +#if defined(OPENMP) + !$OMP CRITICAL(LISTINS) + ix = idxmap%glob_to_loc(idxin(i)) + if (ix < 0) then + ix = idxmap%local_cols + 1 + call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) + if (info /= 0) then + info = -4 + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if + end if + !$OMP END CRITICAL(LISTINS) +#else ix = idxmap%local_cols + 1 call psb_ensure_size(ix,idxmap%loc_to_glob,info,addsz=laddsz) if (info /= 0) then info = -4 - return - end if - idxmap%local_cols = ix - idxmap%loc_to_glob(ix) = idxin(i) - idxmap%glob_to_loc(idxin(i)) = ix + else + idxmap%local_cols = ix + idxmap%loc_to_glob(ix) = idxin(i) + idxmap%glob_to_loc(idxin(i)) = ix + end if +#endif end if idxout(i) = ix else diff --git a/base/modules/desc/psb_repl_map_mod.f90 b/base/modules/desc/psb_repl_map_mod.F90 similarity index 100% rename from base/modules/desc/psb_repl_map_mod.f90 rename to base/modules/desc/psb_repl_map_mod.F90 diff --git a/base/modules/psb_const_mod.F90 b/base/modules/psb_const_mod.F90 index cc29f049..56134474 100644 --- a/base/modules/psb_const_mod.F90 +++ b/base/modules/psb_const_mod.F90 @@ -185,13 +185,17 @@ module psb_const_mod ! The up/down constant are defined in pairs having ! opposite values. We make use of this fact in the heapsort routine. ! - integer(psb_ipk_), parameter :: psb_sort_up_ = 1, psb_sort_down_ = -1 - integer(psb_ipk_), parameter :: psb_lsort_up_ = 2, psb_lsort_down_ = -2 - integer(psb_ipk_), parameter :: psb_asort_up_ = 3, psb_asort_down_ = -3 - integer(psb_ipk_), parameter :: psb_alsort_up_ = 4, psb_alsort_down_ = -4 - integer(psb_ipk_), parameter :: psb_sort_ovw_idx_ = 0, psb_sort_keep_idx_ = 1 - integer(psb_ipk_), parameter :: psb_heap_resize = 200 + integer(psb_ipk_), parameter :: psb_sort_up_ = 1, psb_sort_down_ = -1 + integer(psb_ipk_), parameter :: psb_lsort_up_ = 2, psb_lsort_down_ = -2 + integer(psb_ipk_), parameter :: psb_asort_up_ = 3, psb_asort_down_ = -3 + integer(psb_ipk_), parameter :: psb_alsort_up_ = 4, psb_alsort_down_ = -4 + integer(psb_ipk_), parameter :: psb_sort_ovw_idx_ = 0, psb_sort_keep_idx_ = 1 + integer(psb_ipk_), parameter :: psb_heap_resize = 200 + integer(psb_ipk_), parameter :: psb_find_any_ = 0 + integer(psb_ipk_), parameter :: psb_find_first_ge_ = 1 + integer(psb_ipk_), parameter :: psb_find_last_le_ = 2 + ! ! Sparse matrix constants diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90 index 7a7bbb1f..33982e3a 100644 --- a/base/modules/serial/psb_c_base_mat_mod.F90 +++ b/base/modules/serial/psb_c_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_c_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_c_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_c_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_c_coo_ensure_size + procedure, pass(a) :: tril => psb_c_coo_tril + procedure, pass(a) :: triu => psb_c_coo_triu procedure, pass(a) :: cp_to_coo => psb_c_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_c_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_c_cp_coo_to_fmt @@ -1866,18 +1868,6 @@ module psb_c_base_mat_mod end subroutine psb_c_fix_coo_inner end interface - interface - subroutine psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_c_fix_coo_inner_colmajor - end interface - interface subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) @@ -1906,6 +1896,93 @@ module psb_c_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_c_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_c_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_c_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_c_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_c_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_c_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_c_coo_triu + end interface + !> !! \memberof psb_c_coo_sparse_mat diff --git a/base/modules/serial/psb_c_base_vect_mod.F90 b/base/modules/serial/psb_c_base_vect_mod.F90 index 44044771..df15e0c9 100644 --- a/base/modules/serial/psb_c_base_vect_mod.F90 +++ b/base/modules/serial/psb_c_base_vect_mod.F90 @@ -481,7 +481,11 @@ contains implicit none class(psb_c_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=czero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=czero + !$omp end workshare + end if call x%set_host() end subroutine c_base_zero diff --git a/base/modules/serial/psb_d_base_mat_mod.F90 b/base/modules/serial/psb_d_base_mat_mod.F90 index eb49905d..5f4c76df 100644 --- a/base/modules/serial/psb_d_base_mat_mod.F90 +++ b/base/modules/serial/psb_d_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_d_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_d_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_d_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_d_coo_ensure_size + procedure, pass(a) :: tril => psb_d_coo_tril + procedure, pass(a) :: triu => psb_d_coo_triu procedure, pass(a) :: cp_to_coo => psb_d_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_d_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_d_cp_coo_to_fmt @@ -1866,18 +1868,6 @@ module psb_d_base_mat_mod end subroutine psb_d_fix_coo_inner end interface - interface - subroutine psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_d_fix_coo_inner_colmajor - end interface - interface subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) @@ -1906,6 +1896,93 @@ module psb_d_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_d_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_d_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_d_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_d_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_d_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_d_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_d_coo_triu + end interface + !> !! \memberof psb_d_coo_sparse_mat diff --git a/base/modules/serial/psb_d_base_vect_mod.F90 b/base/modules/serial/psb_d_base_vect_mod.F90 index a28d12f6..87f5b0e4 100644 --- a/base/modules/serial/psb_d_base_vect_mod.F90 +++ b/base/modules/serial/psb_d_base_vect_mod.F90 @@ -488,7 +488,11 @@ contains implicit none class(psb_d_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=dzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=dzero + !$omp end workshare + end if call x%set_host() end subroutine d_base_zero diff --git a/base/modules/serial/psb_i_base_vect_mod.F90 b/base/modules/serial/psb_i_base_vect_mod.F90 index 0289ecd0..a5cddeb5 100644 --- a/base/modules/serial/psb_i_base_vect_mod.F90 +++ b/base/modules/serial/psb_i_base_vect_mod.F90 @@ -417,7 +417,11 @@ contains implicit none class(psb_i_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=izero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=izero + !$omp end workshare + end if call x%set_host() end subroutine i_base_zero diff --git a/base/modules/serial/psb_l_base_vect_mod.F90 b/base/modules/serial/psb_l_base_vect_mod.F90 index d8654f63..93b29e17 100644 --- a/base/modules/serial/psb_l_base_vect_mod.F90 +++ b/base/modules/serial/psb_l_base_vect_mod.F90 @@ -418,7 +418,11 @@ contains implicit none class(psb_l_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=lzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=lzero + !$omp end workshare + end if call x%set_host() end subroutine l_base_zero diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90 index 79c8222b..92bda7d8 100644 --- a/base/modules/serial/psb_s_base_mat_mod.F90 +++ b/base/modules/serial/psb_s_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_s_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_s_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_s_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_s_coo_ensure_size + procedure, pass(a) :: tril => psb_s_coo_tril + procedure, pass(a) :: triu => psb_s_coo_triu procedure, pass(a) :: cp_to_coo => psb_s_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_s_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_s_cp_coo_to_fmt @@ -1866,18 +1868,6 @@ module psb_s_base_mat_mod end subroutine psb_s_fix_coo_inner end interface - interface - subroutine psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_s_fix_coo_inner_colmajor - end interface - interface subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) @@ -1906,6 +1896,93 @@ module psb_s_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_s_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_s_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_s_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_s_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_s_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_s_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_s_coo_triu + end interface + !> !! \memberof psb_s_coo_sparse_mat diff --git a/base/modules/serial/psb_s_base_vect_mod.F90 b/base/modules/serial/psb_s_base_vect_mod.F90 index 4bd6bbfb..fccd846b 100644 --- a/base/modules/serial/psb_s_base_vect_mod.F90 +++ b/base/modules/serial/psb_s_base_vect_mod.F90 @@ -488,7 +488,11 @@ contains implicit none class(psb_s_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=szero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=szero + !$omp end workshare + end if call x%set_host() end subroutine s_base_zero diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90 index 5b6ca07b..3e8196f4 100644 --- a/base/modules/serial/psb_z_base_mat_mod.F90 +++ b/base/modules/serial/psb_z_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_z_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_z_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_z_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_z_coo_ensure_size + procedure, pass(a) :: tril => psb_z_coo_tril + procedure, pass(a) :: triu => psb_z_coo_triu procedure, pass(a) :: cp_to_coo => psb_z_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_z_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_z_cp_coo_to_fmt @@ -1866,18 +1868,6 @@ module psb_z_base_mat_mod end subroutine psb_z_fix_coo_inner end interface - interface - subroutine psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) - import - integer(psb_ipk_), intent(in) :: nr,nc,nzin,dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout - integer(psb_ipk_), intent(out) :: info - end subroutine psb_z_fix_coo_inner_colmajor - end interface - interface subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) @@ -1906,6 +1896,93 @@ module psb_z_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_z_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_z_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_z_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_z_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_z_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_z_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_z_coo_triu + end interface + !> !! \memberof psb_z_coo_sparse_mat diff --git a/base/modules/serial/psb_z_base_vect_mod.F90 b/base/modules/serial/psb_z_base_vect_mod.F90 index c52dcd59..2a14de21 100644 --- a/base/modules/serial/psb_z_base_vect_mod.F90 +++ b/base/modules/serial/psb_z_base_vect_mod.F90 @@ -481,7 +481,11 @@ contains implicit none class(psb_z_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=zzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=zzero + !$omp end workshare + end if call x%set_host() end subroutine z_base_zero diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 7e3c8160..46391dee 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -2817,6 +2817,9 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_coo_sparse_mat), intent(inout) :: a @@ -2828,7 +2831,7 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='c_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2860,10 +2863,13 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2871,16 +2877,20 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2949,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz @@ -3478,6 +3487,602 @@ subroutine psb_c_coo_mv_from(a,b) end subroutine psb_c_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_c_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_tril + implicit none + + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_coo_tril + +subroutine psb_c_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_triu + implicit none + + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_coo_triu + subroutine psb_c_fix_coo(a,info,idir) use psb_const_mod @@ -3561,7 +4166,7 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3620,8 +4225,6 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_c_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3660,12 +4263,13 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_spk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3747,9 +4351,9 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3762,81 +4366,46 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do + !t0 = omp_get_wtime() !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do !$OMP BARRIER - + !$OMP SINGLE + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3865,45 +4434,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3911,7 +4442,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3940,7 +4471,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3969,7 +4500,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4174,7 +4705,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4188,22 +4719,22 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4333,684 +4864,14 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_c_fix_coo_inner_rowmajor -subroutine psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod + +subroutine psb_c_cp_coo_to_lcoo(a,b,info) use psb_error_mod - use psb_c_base_mat_mod, psb_protect_name => psb_c_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif + use psb_c_base_mat_mod, psb_protect_name => psb_c_cp_coo_to_lcoo implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - complex(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_c_fix_coo_inner_colmajor - - -subroutine psb_c_cp_coo_to_lcoo(a,b,info) - use psb_error_mod - use psb_c_base_mat_mod, psb_protect_name => psb_c_cp_coo_to_lcoo - implicit none - class(psb_c_coo_sparse_mat), intent(in) :: a - class(psb_lc_coo_sparse_mat), intent(inout) :: b - integer(psb_ipk_), intent(out) :: info + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_lc_coo_sparse_mat), intent(inout) :: b + integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act integer(psb_lpk_) :: nz @@ -5296,7 +5157,7 @@ function psb_lc_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -5363,7 +5224,7 @@ function psb_lc_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -5413,7 +5274,7 @@ function psb_lc_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do @@ -8335,3 +8196,4 @@ subroutine psb_lc_cp_coo_from_icoo(a,b,info) return end subroutine psb_lc_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_c_csc_impl.f90 b/base/serial/impl/psb_c_csc_impl.F90 similarity index 98% rename from base/serial/impl/psb_c_csc_impl.f90 rename to base/serial/impl/psb_c_csc_impl.F90 index 6360d73a..54332d06 100644 --- a/base/serial/impl/psb_c_csc_impl.f90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_c_base_mat_mod use psb_c_csc_mat_mod, psb_protect_name => psb_c_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,33 @@ subroutine psb_c_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione) + !$OMP END PARALLEL +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip + call psi_exscan(nc+1,a%icp,info,shift=ione) +#endif call a%set_host() @@ -2311,9 +2329,28 @@ subroutine psb_c_cp_csc_to_fmt(a,b,info) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2425,9 +2462,27 @@ subroutine psb_c_cp_csc_from_fmt(a,b,info) a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_c_csr_impl.f90 b/base/serial/impl/psb_c_csr_impl.F90 similarity index 91% rename from base/serial/impl/psb_c_csr_impl.f90 rename to base/serial/impl/psb_c_csr_impl.F90 index 55a91648..6c3ecec6 100644 --- a/base/serial/impl/psb_c_csr_impl.f90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = czero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2277,7 +2289,155 @@ subroutine psb_c_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2347,7 +2507,7 @@ subroutine psb_c_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2431,6 +2591,158 @@ subroutine psb_c_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2499,7 +2811,7 @@ subroutine psb_c_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2844,6 +3156,9 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_c_base_mat_mod use psb_c_csr_mat_mod, psb_protect_name => psb_c_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csr_sparse_mat), intent(inout) :: a @@ -2860,12 +3175,6 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) character(len=20) :: name='c_cp_csr_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2907,96 +3216,37 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif + call a%set_host() + end subroutine psb_c_cp_csr_from_coo @@ -3096,6 +3346,9 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_c_base_mat_mod use psb_c_csr_mat_mod, psb_protect_name => psb_c_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_c_csr_sparse_mat), intent(inout) :: a @@ -3109,13 +3362,6 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. - info = psb_success_ debug_unit = psb_get_debug_unit() @@ -3139,90 +3385,34 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() end subroutine psb_c_mv_csr_from_coo @@ -3300,9 +3490,28 @@ subroutine psb_c_cp_csr_to_fmt(a,b,info) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3386,9 +3595,27 @@ subroutine psb_c_cp_csr_from_fmt(a,b,info) a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index f326e46f..c2babf8e 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -2817,6 +2817,9 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_coo_sparse_mat), intent(inout) :: a @@ -2828,7 +2831,7 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='d_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2860,10 +2863,13 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2871,16 +2877,20 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2949,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz @@ -3478,6 +3487,602 @@ subroutine psb_d_coo_mv_from(a,b) end subroutine psb_d_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_d_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_tril + implicit none + + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_coo_tril + +subroutine psb_d_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_triu + implicit none + + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_coo_triu + subroutine psb_d_fix_coo(a,info,idir) use psb_const_mod @@ -3561,7 +4166,7 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3620,8 +4225,6 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_d_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3660,12 +4263,13 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_dpk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3747,9 +4351,9 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3762,81 +4366,46 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do + !t0 = omp_get_wtime() !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do !$OMP BARRIER - + !$OMP SINGLE + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3865,45 +4434,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3911,7 +4442,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3940,7 +4471,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3969,7 +4500,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4174,7 +4705,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4188,22 +4719,22 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4333,684 +4864,14 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_d_fix_coo_inner_rowmajor -subroutine psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod + +subroutine psb_d_cp_coo_to_lcoo(a,b,info) use psb_error_mod - use psb_d_base_mat_mod, psb_protect_name => psb_d_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif + use psb_d_base_mat_mod, psb_protect_name => psb_d_cp_coo_to_lcoo implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - real(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_d_fix_coo_inner_colmajor - - -subroutine psb_d_cp_coo_to_lcoo(a,b,info) - use psb_error_mod - use psb_d_base_mat_mod, psb_protect_name => psb_d_cp_coo_to_lcoo - implicit none - class(psb_d_coo_sparse_mat), intent(in) :: a - class(psb_ld_coo_sparse_mat), intent(inout) :: b - integer(psb_ipk_), intent(out) :: info + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_ld_coo_sparse_mat), intent(inout) :: b + integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act integer(psb_lpk_) :: nz @@ -5296,7 +5157,7 @@ function psb_ld_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -5363,7 +5224,7 @@ function psb_ld_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -5413,7 +5274,7 @@ function psb_ld_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do @@ -8335,3 +8196,4 @@ subroutine psb_ld_cp_coo_from_icoo(a,b,info) return end subroutine psb_ld_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_d_csc_impl.f90 b/base/serial/impl/psb_d_csc_impl.F90 similarity index 98% rename from base/serial/impl/psb_d_csc_impl.f90 rename to base/serial/impl/psb_d_csc_impl.F90 index 3b9b1b42..1761b051 100644 --- a/base/serial/impl/psb_d_csc_impl.f90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_d_base_mat_mod use psb_d_csc_mat_mod, psb_protect_name => psb_d_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,33 @@ subroutine psb_d_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione) + !$OMP END PARALLEL +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip + call psi_exscan(nc+1,a%icp,info,shift=ione) +#endif call a%set_host() @@ -2311,9 +2329,28 @@ subroutine psb_d_cp_csc_to_fmt(a,b,info) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2425,9 +2462,27 @@ subroutine psb_d_cp_csc_from_fmt(a,b,info) a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_d_csr_impl.f90 b/base/serial/impl/psb_d_csr_impl.F90 similarity index 91% rename from base/serial/impl/psb_d_csr_impl.f90 rename to base/serial/impl/psb_d_csr_impl.F90 index 2c59c1a5..5b667342 100644 --- a/base/serial/impl/psb_d_csr_impl.f90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = dzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2277,7 +2289,155 @@ subroutine psb_d_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2347,7 +2507,7 @@ subroutine psb_d_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2431,6 +2591,158 @@ subroutine psb_d_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2499,7 +2811,7 @@ subroutine psb_d_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2844,6 +3156,9 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_d_base_mat_mod use psb_d_csr_mat_mod, psb_protect_name => psb_d_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csr_sparse_mat), intent(inout) :: a @@ -2860,12 +3175,6 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) character(len=20) :: name='d_cp_csr_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2907,96 +3216,37 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif + call a%set_host() + end subroutine psb_d_cp_csr_from_coo @@ -3096,6 +3346,9 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_d_base_mat_mod use psb_d_csr_mat_mod, psb_protect_name => psb_d_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_d_csr_sparse_mat), intent(inout) :: a @@ -3109,13 +3362,6 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. - info = psb_success_ debug_unit = psb_get_debug_unit() @@ -3139,90 +3385,34 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() end subroutine psb_d_mv_csr_from_coo @@ -3300,9 +3490,28 @@ subroutine psb_d_cp_csr_to_fmt(a,b,info) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3386,9 +3595,27 @@ subroutine psb_d_cp_csr_from_fmt(a,b,info) a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 7be72cce..402c608a 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -2817,6 +2817,9 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_coo_sparse_mat), intent(inout) :: a @@ -2828,7 +2831,7 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='s_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2860,10 +2863,13 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2871,16 +2877,20 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2949,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz @@ -3478,6 +3487,602 @@ subroutine psb_s_coo_mv_from(a,b) end subroutine psb_s_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_s_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_tril + implicit none + + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_coo_tril + +subroutine psb_s_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_triu + implicit none + + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_coo_triu + subroutine psb_s_fix_coo(a,info,idir) use psb_const_mod @@ -3561,7 +4166,7 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3620,8 +4225,6 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_s_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3660,12 +4263,13 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_spk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3747,9 +4351,9 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3762,81 +4366,46 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do + !t0 = omp_get_wtime() !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do !$OMP BARRIER - + !$OMP SINGLE + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3865,45 +4434,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3911,7 +4442,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3940,7 +4471,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3969,7 +4500,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4174,7 +4705,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4188,22 +4719,22 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4333,684 +4864,14 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_s_fix_coo_inner_rowmajor -subroutine psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod + +subroutine psb_s_cp_coo_to_lcoo(a,b,info) use psb_error_mod - use psb_s_base_mat_mod, psb_protect_name => psb_s_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif + use psb_s_base_mat_mod, psb_protect_name => psb_s_cp_coo_to_lcoo implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - real(psb_spk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - real(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_s_fix_coo_inner_colmajor - - -subroutine psb_s_cp_coo_to_lcoo(a,b,info) - use psb_error_mod - use psb_s_base_mat_mod, psb_protect_name => psb_s_cp_coo_to_lcoo - implicit none - class(psb_s_coo_sparse_mat), intent(in) :: a - class(psb_ls_coo_sparse_mat), intent(inout) :: b - integer(psb_ipk_), intent(out) :: info + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_ls_coo_sparse_mat), intent(inout) :: b + integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act integer(psb_lpk_) :: nz @@ -5296,7 +5157,7 @@ function psb_ls_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -5363,7 +5224,7 @@ function psb_ls_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -5413,7 +5274,7 @@ function psb_ls_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do @@ -8335,3 +8196,4 @@ subroutine psb_ls_cp_coo_from_icoo(a,b,info) return end subroutine psb_ls_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_s_csc_impl.f90 b/base/serial/impl/psb_s_csc_impl.F90 similarity index 98% rename from base/serial/impl/psb_s_csc_impl.f90 rename to base/serial/impl/psb_s_csc_impl.F90 index 3da31d16..a66b7dc0 100644 --- a/base/serial/impl/psb_s_csc_impl.f90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_s_base_mat_mod use psb_s_csc_mat_mod, psb_protect_name => psb_s_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,33 @@ subroutine psb_s_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione) + !$OMP END PARALLEL +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip + call psi_exscan(nc+1,a%icp,info,shift=ione) +#endif call a%set_host() @@ -2311,9 +2329,28 @@ subroutine psb_s_cp_csc_to_fmt(a,b,info) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2425,9 +2462,27 @@ subroutine psb_s_cp_csc_from_fmt(a,b,info) a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_s_csr_impl.f90 b/base/serial/impl/psb_s_csr_impl.F90 similarity index 91% rename from base/serial/impl/psb_s_csr_impl.f90 rename to base/serial/impl/psb_s_csr_impl.F90 index 75358dbc..dbe7a4be 100644 --- a/base/serial/impl/psb_s_csr_impl.f90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = szero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2277,7 +2289,155 @@ subroutine psb_s_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2347,7 +2507,7 @@ subroutine psb_s_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2431,6 +2591,158 @@ subroutine psb_s_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2499,7 +2811,7 @@ subroutine psb_s_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2844,6 +3156,9 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_s_base_mat_mod use psb_s_csr_mat_mod, psb_protect_name => psb_s_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csr_sparse_mat), intent(inout) :: a @@ -2860,12 +3175,6 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) character(len=20) :: name='s_cp_csr_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2907,96 +3216,37 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif + call a%set_host() + end subroutine psb_s_cp_csr_from_coo @@ -3096,6 +3346,9 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_s_base_mat_mod use psb_s_csr_mat_mod, psb_protect_name => psb_s_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_s_csr_sparse_mat), intent(inout) :: a @@ -3109,13 +3362,6 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. - info = psb_success_ debug_unit = psb_get_debug_unit() @@ -3139,90 +3385,34 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() end subroutine psb_s_mv_csr_from_coo @@ -3300,9 +3490,28 @@ subroutine psb_s_cp_csr_to_fmt(a,b,info) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3386,9 +3595,27 @@ subroutine psb_s_cp_csr_from_fmt(a,b,info) a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index d06e2739..542f842e 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -2817,6 +2817,9 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) use psb_realloc_mod use psb_sort_mod use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_csput_a +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_coo_sparse_mat), intent(inout) :: a @@ -2828,7 +2831,7 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) integer(psb_ipk_) :: err_act character(len=20) :: name='z_coo_csput_a_impl' logical, parameter :: debug=.false. - integer(psb_ipk_) :: nza, i,j,k, nzl, isza, debug_level, debug_unit + integer(psb_ipk_) :: nza, i,j,k, nzl, isza, nzaold, debug_level, debug_unit info = psb_success_ debug_unit = psb_get_debug_unit() @@ -2860,10 +2863,13 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return - - nza = a%get_nzeros() - isza = a%get_size() if (a%is_bld()) then + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. + !$omp critical + nza = a%get_nzeros() + isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then call a%reallocate(max(nza+nz,int(1.5*isza))) @@ -2871,16 +2877,20 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) isza = a%get_size() if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) - goto 9999 + else + nzaold = nza + nza = nza + nz + call a%set_nzeros(nza) end if - - call psb_inner_ins(nz,ia,ja,val,nza,a%ia,a%ja,a%val,isza,& + !$omp end critical + if (info /= 0) goto 9999 + call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) - call a%set_nzeros(nza) call a%set_sorted(.false.) - - + else if (a%is_upd()) then + nza = a%get_nzeros() + isza = a%get_size() if (a%is_dev()) call a%sync() @@ -2949,7 +2959,6 @@ contains end if end do !$OMP END PARALLEL DO - nza = nza + nz #else do i=1, nz @@ -3478,6 +3487,602 @@ subroutine psb_z_coo_mv_from(a,b) end subroutine psb_z_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_z_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_tril + implicit none + + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_coo_tril + +subroutine psb_z_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_triu + implicit none + + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_coo_triu + subroutine psb_z_fix_coo(a,info,idir) use psb_const_mod @@ -3561,7 +4166,7 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3620,8 +4225,6 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! Dirty trick: call ROWMAJOR with rows <-> columns call psb_z_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& & ja,ia,val,iaux,nzout,info) -!!$ call psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& -!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -3660,12 +4263,13 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_dpk_), allocatable :: vs(:) integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii + integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers + real(psb_dpk_) :: t0, t1 #if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif @@ -3747,9 +4351,9 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) + !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & + !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) !$OMP SINGLE nthreads = omp_get_num_threads() @@ -3762,81 +4366,46 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s + idxend = idxstart + work - 1 + !write(0,*) 'fix_coo_inner: trying with exscan' + call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do + !t0 = omp_get_wtime() !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=first_idx,last_idx + do j=idxstart,idxend idxaux(j) = iaux(j) end do - ! Here we sort data inside the auxiliary buffers do i=1,nzin act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then + if ((act_row >= idxstart) .and. (act_row <= idxend)) then ias(idxaux(act_row)) = ia(i) jas(idxaux(act_row)) = ja(i) vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 end if end do !$OMP BARRIER - + !$OMP SINGLE + !t1 = omp_get_wtime() + !write(0,*) ithread,'Srt&Cpy :',t1-t0 + !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=first_idx,last_idx + do j=idxstart,idxend first_elem = iaux(j) last_elem = iaux(j+1) - 1 nzl = last_elem - first_elem + 1 @@ -3865,45 +4434,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! -------------------------------------------------- ! ---------------- kaux composition ---------------- - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do + call psi_exscan(nr+1,kaux,i,shift=ione) !$OMP BARRIER @@ -3911,7 +4442,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf select case(dupl) case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3940,7 +4471,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_add_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -3969,7 +4500,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !$OMP END DO case(psb_dupl_err_) - !$OMP DO schedule(STATIC) + !$OMP DO schedule(dynamic,32) do j=1,nr first_elem = iaux(j) last_elem = iaux(j+1) - 1 @@ -4174,7 +4705,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) !$OMP PARALLEL default(none) & !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & + !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) !$OMP SINGLE @@ -4188,22 +4719,22 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf work = nr/nthreads if (ithread < MOD(nr,nthreads)) then work = work + 1 - first_idx = ithread*work + 1 + idxstart = ithread*work + 1 else - first_idx = ithread*work + MOD(nr,nthreads) + 1 + idxstart = ithread*work + MOD(nr,nthreads) + 1 end if - last_idx = first_idx + work - 1 + idxend = idxstart + work - 1 ! --------------------------------------------------- first_elem = 0 last_elem = -1 - act_row = first_idx + act_row = idxstart do j=1,nzin if (ia(j) < act_row) then cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then + else if ((ia(j) > idxend) .or. (work < 1)) then exit else if (ia(j) > act_row) then nzl = last_elem - first_elem + 1 @@ -4333,684 +4864,14 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end subroutine psb_z_fix_coo_inner_rowmajor -subroutine psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,info) - use psb_const_mod + +subroutine psb_z_cp_coo_to_lcoo(a,b,info) use psb_error_mod - use psb_z_base_mat_mod, psb_protect_name => psb_z_fix_coo_inner_colmajor - use psb_string_mod - use psb_ip_reord_mod - use psb_sort_mod -#if defined(OPENMP) - use omp_lib -#endif + use psb_z_base_mat_mod, psb_protect_name => psb_z_cp_coo_to_lcoo implicit none - - integer(psb_ipk_), intent(in) :: nr, nc, nzin, dupl - integer(psb_ipk_), intent(inout) :: ia(:), ja(:), iaux(:) - complex(psb_dpk_), intent(inout) :: val(:) - integer(psb_ipk_), intent(out) :: nzout, info - !locals - integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) - complex(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret - integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii - integer(psb_ipk_) :: debug_level, debug_unit - character(len=20) :: name = 'psb_fixcoo' - logical :: srt_inp, use_buffers -#if defined(OPENMP) - integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread - integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) -#endif - info = psb_success_ - - call psb_erractionsave(err_act) - debug_unit = psb_get_debug_unit() - debug_level = psb_get_debug_level() - - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -end subroutine psb_z_fix_coo_inner_colmajor - - -subroutine psb_z_cp_coo_to_lcoo(a,b,info) - use psb_error_mod - use psb_z_base_mat_mod, psb_protect_name => psb_z_cp_coo_to_lcoo - implicit none - class(psb_z_coo_sparse_mat), intent(in) :: a - class(psb_lz_coo_sparse_mat), intent(inout) :: b - integer(psb_ipk_), intent(out) :: info + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_lz_coo_sparse_mat), intent(inout) :: b + integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act integer(psb_lpk_) :: nz @@ -5296,7 +5157,7 @@ function psb_lz_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -5363,7 +5224,7 @@ function psb_lz_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -5413,7 +5274,7 @@ function psb_lz_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do @@ -8335,3 +8196,4 @@ subroutine psb_lz_cp_coo_from_icoo(a,b,info) return end subroutine psb_lz_cp_coo_from_icoo + diff --git a/base/serial/impl/psb_z_csc_impl.f90 b/base/serial/impl/psb_z_csc_impl.F90 similarity index 98% rename from base/serial/impl/psb_z_csc_impl.f90 rename to base/serial/impl/psb_z_csc_impl.F90 index 8af661fa..e5516bd9 100644 --- a/base/serial/impl/psb_z_csc_impl.f90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2189,6 +2189,9 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) use psb_error_mod use psb_z_base_mat_mod use psb_z_csc_mat_mod, psb_protect_name => psb_z_mv_csc_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csc_sparse_mat), intent(inout) :: a @@ -2223,18 +2226,33 @@ subroutine psb_z_mv_csc_from_coo(a,b,info) call psb_realloc(nc+1,a%icp,info) call b%free() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) + + !$OMP WORKSHARE a%icp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) do k=1,nza i = itemp(k) - a%icp(i) = a%icp(i) + 1 + !$OMP ATOMIC UPDATE + a%icp(i+1) = a%icp(i+1) + 1 + !$OMP END ATOMIC end do - ip = 1 - do i=1,nc - nrl = a%icp(i) - a%icp(i) = ip - ip = ip + nrl + !$OMP END DO + call psi_exscan(nc+1,a%icp,info,shift=ione) + !$OMP END PARALLEL +#else + a%icp(:) = 0 + do k=1,nza + i = itemp(k) + a%icp(i) = a%icp(i) + 1 end do - a%icp(nc+1) = ip + call psi_exscan(nc+1,a%icp,info,shift=ione) +#endif call a%set_host() @@ -2311,9 +2329,28 @@ subroutine psb_z_cp_csc_to_fmt(a,b,info) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2425,9 +2462,27 @@ subroutine psb_z_cp_csc_from_fmt(a,b,info) a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_z_csr_impl.f90 b/base/serial/impl/psb_z_csr_impl.F90 similarity index 91% rename from base/serial/impl/psb_z_csr_impl.f90 rename to base/serial/impl/psb_z_csr_impl.F90 index 4f2693c0..9322105e 100644 --- a/base/serial/impl/psb_z_csr_impl.f90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -152,6 +152,7 @@ contains !$omp parallel do private(i,j, acc) schedule(static) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -163,6 +164,7 @@ contains !$omp parallel do private(i,j, acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -174,6 +176,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -189,6 +192,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -200,6 +204,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -211,6 +216,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -225,6 +231,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -236,6 +243,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -247,6 +255,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -261,6 +270,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -272,6 +282,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -283,6 +294,7 @@ contains !$omp parallel do private(i,j,acc) do i=1,m acc = zzero + !$omp simd do j=irp(i), irp(i+1)-1 acc = acc + val(j) * x(ja(j)) enddo @@ -2277,7 +2289,155 @@ subroutine psb_z_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2347,7 +2507,7 @@ subroutine psb_z_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2431,6 +2591,158 @@ subroutine psb_z_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2499,7 +2811,7 @@ subroutine psb_z_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2844,6 +3156,9 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) use psb_realloc_mod use psb_z_base_mat_mod use psb_z_csr_mat_mod, psb_protect_name => psb_z_cp_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csr_sparse_mat), intent(inout) :: a @@ -2860,12 +3175,6 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) character(len=20) :: name='z_cp_csr_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s,j - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem,maxthreads - !$ use_openmp = .true. - - info = psb_success_ debug_unit = psb_get_debug_unit() debug_level = psb_get_debug_level() @@ -2907,96 +3216,37 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) endif - a%irp(:) = 0 -!!$ if (use_openmp) then -!!$ !$ maxthreads = omp_get_max_threads() -!!$ !$ allocate(sum(maxthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() +#if defined(OPENMP) + + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE + a%irp(:) = 0 + !$OMP END WORKSHARE + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif + call a%set_host() + end subroutine psb_z_cp_csr_from_coo @@ -3096,6 +3346,9 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) use psb_error_mod use psb_z_base_mat_mod use psb_z_csr_mat_mod, psb_protect_name => psb_z_mv_csr_from_coo +#if defined(OPENMP) + use omp_lib +#endif implicit none class(psb_z_csr_sparse_mat), intent(inout) :: a @@ -3109,13 +3362,6 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) integer(psb_ipk_), Parameter :: maxtry=8 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name='mv_from_coo' - logical :: use_openmp = .false. - - ! $ integer(psb_ipk_), allocatable :: sum(:) - ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - ! $ use_openmp = .true. - info = psb_success_ debug_unit = psb_get_debug_unit() @@ -3139,90 +3385,34 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) call psb_realloc(max(nr+1,nc+1),a%irp,info) call b%free() +#if defined(OPENMP) + !$OMP PARALLEL default(shared) reduction(max:info) + + !$OMP WORKSHARE a%irp(:) = 0 + !$OMP END WORKSHARE + + !$OMP DO schedule(STATIC) & + !$OMP private(k,i) + do k=1,nza + i = itemp(k) + !$OMP ATOMIC UPDATE + a%irp(i) = a%irp(i) + 1 + !$OMP END ATOMIC + end do + !$OMP END DO + call psi_exscan(nr+1,a%irp,info,shift=ione) + !$OMP END PARALLEL +#else + a%irp(:) = 0 + do k=1,nza + i = itemp(k) + a%irp(i) = a%irp(i) + 1 + end do + call psi_exscan(nr+1,a%irp,info,shift=ione) +#endif -!!$ if (use_openmp) then -!!$ !$OMP PARALLEL default(none) & -!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & -!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) -!!$ -!!$ !$OMP DO schedule(STATIC) & -!!$ !$OMP private(k,i) -!!$ do k=1,nza -!!$ i = itemp(k) -!!$ a%irp(i) = a%irp(i) + 1 -!!$ end do -!!$ !$OMP END DO -!!$ -!!$ !$OMP SINGLE -!!$ !$ nthreads = omp_get_num_threads() -!!$ !$ allocate(sum(nthreads+1)) -!!$ !$ sum(:) = 0 -!!$ !$ sum(1) = 1 -!!$ !$OMP END SINGLE -!!$ -!!$ !$ ithread = omp_get_thread_num() -!!$ -!!$ !$ work = nr/nthreads -!!$ !$ if (ithread < MOD(nr,nthreads)) then -!!$ !$ work = work + 1 -!!$ !$ first_idx = ithread*work + 1 -!!$ !$ else -!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 -!!$ !$ end if -!!$ -!!$ !$ last_idx = first_idx + work - 1 -!!$ -!!$ !$ s = 0 -!!$ !$ do i=first_idx,last_idx -!!$ !$ s = s + a%irp(i) -!!$ !$ end do -!!$ !$ if (work > 0) then -!!$ !$ sum(ithread+2) = s -!!$ !$ end if -!!$ -!!$ !$OMP BARRIER -!!$ -!!$ !$OMP SINGLE -!!$ !$ do i=2,nthreads+1 -!!$ !$ sum(i) = sum(i) + sum(i-1) -!!$ !$ end do -!!$ !$OMP END SINGLE -!!$ -!!$ !$ if (work > 0) then -!!$ !$ saved_elem = a%irp(first_idx) -!!$ !$ end if -!!$ !$ if (ithread == 0) then -!!$ !$ a%irp(1) = 1 -!!$ !$ end if -!!$ -!!$ !$ if (work > 0) then -!!$ !$ old_val = a%irp(first_idx+1) -!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) -!!$ !$ end if -!!$ -!!$ !$ do i=first_idx+2,last_idx+1 -!!$ !$ nxt_val = a%irp(i) -!!$ !$ a%irp(i) = a%irp(i-1) + old_val -!!$ !$ old_val = nxt_val -!!$ !$ end do -!!$ -!!$ !$OMP END PARALLEL -!!$ else - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - ip = 1 - do i=1,nr - ncl = a%irp(i) - a%irp(i) = ip - ip = ip + ncl - end do - a%irp(nr+1) = ip -!!$ end if - call a%set_host() end subroutine psb_z_mv_csr_from_coo @@ -3300,9 +3490,28 @@ subroutine psb_z_cp_csr_to_fmt(a,b,info) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3386,9 +3595,27 @@ subroutine psb_z_cp_csr_from_fmt(a,b,info) a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index 1da2ce6e..a3898349 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_c_exscanv(n,x,info,shift) + use psi_c_serial_mod, psb_protect_name => psi_c_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), intent(in), optional :: shift + + complex(psb_spk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = czero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_c_exscan() + else + !$OMP PARALLEL default(shared) + call inner_c_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_c_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + complex(psb_spk_), pointer :: sumb(:) + complex(psb_spk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = czero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_c_exscan +#endif +end subroutine psi_c_exscanv + subroutine psb_m_cgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_cgelp use psb_const_mod diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index 8c65b349..1b5b1442 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_d_exscanv(n,x,info,shift) + use psi_d_serial_mod, psb_protect_name => psi_d_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_dpk_), intent(in), optional :: shift + + real(psb_dpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = dzero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_d_exscan() + else + !$OMP PARALLEL default(shared) + call inner_d_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_d_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + real(psb_dpk_), pointer :: sumb(:) + real(psb_dpk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = dzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_d_exscan +#endif +end subroutine psi_d_exscanv + subroutine psb_m_dgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_dgelp use psb_const_mod diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index 988bad52..9cdcdf0e 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_e_exscanv(n,x,info,shift) + use psi_e_serial_mod, psb_protect_name => psi_e_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_epk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_epk_), intent(in), optional :: shift + + integer(psb_epk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = ezero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_e_exscan() + else + !$OMP PARALLEL default(shared) + call inner_e_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_e_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_epk_), pointer :: sumb(:) + integer(psb_epk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = ezero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_e_exscan +#endif +end subroutine psi_e_exscanv + subroutine psb_m_egelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_egelp use psb_const_mod @@ -441,9 +542,9 @@ subroutine psi_eaxpby(m,n,alpha, x, beta, y, info) integer(psb_epk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +603,8 @@ subroutine psi_eaxpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +634,106 @@ subroutine psi_eaxpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call eaxpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call eaxpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.ezero) then + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = ezero + enddo + else if (beta.eq.eone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +756,7 @@ subroutine psi_eaxpbyv2(m,alpha, x, beta, y, z, info) integer(psb_epk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +795,105 @@ subroutine psi_eaxpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call eaxpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.ezero) then + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = ezero + enddo + else if (beta.eq.eone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-eone) then + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.ezero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-eone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1241,7 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.ezero) then if (beta.eq.ezero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = ezero enddo @@ -953,12 +1253,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1271,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1286,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1304,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1319,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1337,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1352,14 @@ subroutine eaxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1445,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1463,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1478,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1496,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1511,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1529,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.ezero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1544,14 @@ subroutine eaxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-eone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index 83b078f0..d25617a9 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_i2_exscanv(n,x,info,shift) + use psi_i2_serial_mod, psb_protect_name => psi_i2_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_i2pk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_i2pk_), intent(in), optional :: shift + + integer(psb_i2pk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = i2zero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_i2_exscan() + else + !$OMP PARALLEL default(shared) + call inner_i2_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_i2_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_i2pk_), pointer :: sumb(:) + integer(psb_i2pk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = i2zero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_i2_exscan +#endif +end subroutine psi_i2_exscanv + subroutine psb_m_i2gelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_i2gelp use psb_const_mod @@ -441,9 +542,9 @@ subroutine psi_i2axpby(m,n,alpha, x, beta, y, info) integer(psb_i2pk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +603,8 @@ subroutine psi_i2axpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +634,106 @@ subroutine psi_i2axpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call i2axpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call i2axpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.i2zero) then + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = i2zero + enddo + else if (beta.eq.i2one) then + ! + ! Do nothing! + ! + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +756,7 @@ subroutine psi_i2axpbyv2(m,alpha, x, beta, y, z, info) integer(psb_i2pk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +795,105 @@ subroutine psi_i2axpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call i2axpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.i2zero) then + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = i2zero + enddo + else if (beta.eq.i2one) then + ! + ! Do nothing! + ! + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-i2one) then + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.i2zero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-i2one) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1241,7 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.i2zero) then if (beta.eq.i2zero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = i2zero enddo @@ -953,12 +1253,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1271,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1286,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1304,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1319,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1337,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1352,14 @@ subroutine i2axpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1445,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1463,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1478,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1496,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1511,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1529,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.i2zero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1544,14 @@ subroutine i2axpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-i2one) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index 950e2358..05c8e60f 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_m_exscanv(n,x,info,shift) + use psi_m_serial_mod, psb_protect_name => psi_m_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + integer(psb_mpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_), intent(in), optional :: shift + + integer(psb_mpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = mzero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_m_exscan() + else + !$OMP PARALLEL default(shared) + call inner_m_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_m_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_mpk_), pointer :: sumb(:) + integer(psb_mpk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = mzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_m_exscan +#endif +end subroutine psi_m_exscanv + subroutine psb_m_mgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_mgelp use psb_const_mod @@ -441,9 +542,9 @@ subroutine psi_maxpby(m,n,alpha, x, beta, y, info) integer(psb_mpk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly + integer(psb_ipk_) :: lx, ly, i integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -502,7 +603,8 @@ subroutine psi_maxpbyv(m,alpha, x, beta, y, info) integer(psb_ipk_) :: err_act integer(psb_ipk_) :: lx, ly integer(psb_ipk_) :: ierr(5) - character(len=20) :: name, ch_err + integer(psb_ipk_) :: i + character(len=20) :: name, ch_err name='psb_geaxpby' info=psb_success_ @@ -532,7 +634,106 @@ subroutine psi_maxpbyv(m,alpha, x, beta, y, info) goto 9999 end if - if (m>0) call maxpby(m,ione,alpha,x,lx,beta,y,ly,info) +! if (m>0) call maxpby(m,ione,alpha,x,lx,beta,y,ly,info) + + if (alpha.eq.mzero) then + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = mzero + enddo + else if (beta.eq.mone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + y(i) + enddo + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + y(i) + enddo + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + y(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif + call psb_erractionrestore(err_act) return @@ -555,7 +756,7 @@ subroutine psi_maxpbyv2(m,alpha, x, beta, y, z, info) integer(psb_mpk_), intent (in) :: alpha, beta integer(psb_ipk_), intent(out) :: info integer(psb_ipk_) :: err_act - integer(psb_ipk_) :: lx, ly, lz + integer(psb_ipk_) :: lx, ly, lz, i integer(psb_ipk_) :: ierr(5) character(len=20) :: name, ch_err @@ -594,7 +795,105 @@ subroutine psi_maxpbyv2(m,alpha, x, beta, y, z, info) goto 9999 end if - if (m>0) call maxpbyv2(m,ione,alpha,x,lx,beta,y,ly,z,lz,info) + if (alpha.eq.mzero) then + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = mzero + enddo + else if (beta.eq.mone) then + ! + ! Do nothing! + ! + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = beta*y(i) + enddo + endif + + else if (alpha.eq.mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = x(i) + beta*y(i) + enddo + endif + + else if (alpha.eq.-mone) then + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = -x(i) + beta*y(i) + enddo + endif + + else + + if (beta.eq.mzero) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + enddo + else if (beta.eq.mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + y(i) + enddo + + else if (beta.eq.-mone) then + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) - y(i) + enddo + else + !$omp parallel do private(i) + do i=1,m + Z(i) = alpha*x(i) + beta*y(i) + enddo + endif + + endif call psb_erractionrestore(err_act) return @@ -942,6 +1241,7 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (alpha.eq.mzero) then if (beta.eq.mzero) then do j=1, n + !$omp parallel do private(i) do i=1,m y(i,j) = mzero enddo @@ -953,12 +1253,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = beta*y(i,j) enddo @@ -969,12 +1271,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + y(i,j) enddo @@ -982,12 +1286,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = x(i,j) + beta*y(i,j) enddo @@ -998,12 +1304,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + y(i,j) enddo @@ -1011,12 +1319,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1027,12 +1337,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1040,12 +1352,14 @@ subroutine maxpby(m, n, alpha, X, lldx, beta, Y, lldy, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m y(i,j) = alpha*x(i,j) + beta*y(i,j) enddo @@ -1131,12 +1445,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = beta*y(i,j) enddo @@ -1147,12 +1463,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + y(i,j) enddo @@ -1160,12 +1478,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = x(i,j) + beta*y(i,j) enddo @@ -1176,12 +1496,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + y(i,j) enddo @@ -1189,12 +1511,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = -x(i,j) + beta*y(i,j) enddo @@ -1205,12 +1529,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) if (beta.eq.mzero) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) enddo enddo else if (beta.eq.mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + y(i,j) enddo @@ -1218,12 +1544,14 @@ subroutine maxpbyv2(m, n, alpha, X, lldx, beta, Y, lldy, Z, lldz, info) else if (beta.eq.-mone) then do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) - y(i,j) enddo enddo else do j=1,n + !$omp parallel do private(i) do i=1,m Z(i,j) = alpha*x(i,j) + beta*y(i,j) enddo diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index 6c8e21e2..26a57e68 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_s_exscanv(n,x,info,shift) + use psi_s_serial_mod, psb_protect_name => psi_s_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + real(psb_spk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), intent(in), optional :: shift + + real(psb_spk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = szero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_s_exscan() + else + !$OMP PARALLEL default(shared) + call inner_s_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_s_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + real(psb_spk_), pointer :: sumb(:) + real(psb_spk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = szero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_s_exscan +#endif +end subroutine psi_s_exscanv + subroutine psb_m_sgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_sgelp use psb_const_mod diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index f3087992..0b15b2d6 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -29,6 +29,107 @@ ! POSSIBILITY OF SUCH DAMAGE. ! ! +subroutine psi_z_exscanv(n,x,info,shift) + use psi_z_serial_mod, psb_protect_name => psi_z_exscanv + use psb_const_mod + use psb_error_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + integer(psb_ipk_), intent(in) :: n + complex(psb_dpk_), intent (inout) :: x(:) + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), intent(in), optional :: shift + + complex(psb_dpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i + logical is_nested, is_parallel + + if (present(shift)) then + shift_ = shift + else + shift_ = zzero + end if + +#if defined(OPENMP) + is_parallel = omp_in_parallel() + if (is_parallel) then + call inner_z_exscan() + else + !$OMP PARALLEL default(shared) + call inner_z_exscan() + !$OMP END PARALLEL + end if +#else + tp = shift_ + do i=1,n + ts = x(i) + x(i) = tp + tp = tp + ts + end do + +#endif +#if defined(OPENMP) +contains + subroutine inner_z_exscan() + ! Note: all these variables are private, but SUMB should *really* be + ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied + ! so effectively we are recovering a SHARED SUMB which is what + ! we need in this case. If it was an ALLOCATABLE, then it would be the contents + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + complex(psb_dpk_), pointer :: sumb(:) + complex(psb_dpk_) :: tp, ts + + nthreads = omp_get_num_threads() + ithread = omp_get_thread_num() + !$OMP SINGLE + allocate(sumb(nthreads+1)) + sumb(:) = 0 + !$OMP END SINGLE COPYPRIVATE(sumb) + + wrk = (n)/nthreads + if (ithread < MOD((n),nthreads)) then + wrk = wrk + 1 + idxstart = ithread*wrk + 1 + else + idxstart = ithread*wrk + MOD((n),nthreads) + 1 + end if + + idxend = min(idxstart + wrk - 1,n ) + tp = zzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts + end do + end if + sumb(ithread+2) = tp + !$OMP BARRIER + + !$OMP SINGLE + do i=2,nthreads+1 + sumb(i) = sumb(i) + sumb(i-1) + end do + !$OMP END SINGLE + + !$OMP BARRIER + + !$OMP DO SCHEDULE(STATIC) + do i=1,n + x(i) = x(i) + sumb(ithread+1) + shift_ + end do + !$OMP END DO + !$OMP SINGLE + deallocate(sumb) + !$OMP END SINGLE + end subroutine inner_z_exscan +#endif +end subroutine psi_z_exscanv + subroutine psb_m_zgelp(trans,iperm,x,info) use psb_serial_mod, psb_protect_name => psb_m_zgelp use psb_const_mod diff --git a/base/serial/sort/psb_c_qsort_impl.f90 b/base/serial/sort/psb_c_qsort_impl.f90 index 712529fc..7f33c099 100644 --- a/base/serial/sort/psb_c_qsort_impl.f90 +++ b/base/serial/sort/psb_c_qsort_impl.f90 @@ -40,6 +40,7 @@ ! Data Structures and Algorithms ! Addison-Wesley ! + subroutine psb_cqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_cqsort use psb_error_mod diff --git a/base/serial/sort/psb_d_msort_impl.f90 b/base/serial/sort/psb_d_msort_impl.f90 index 11029818..66ad7897 100644 --- a/base/serial/sort/psb_d_msort_impl.f90 +++ b/base/serial/sort/psb_d_msort_impl.f90 @@ -76,64 +76,6 @@ subroutine psb_dmsort_u(x,nout,dir) return end subroutine psb_dmsort_u - -function psb_dbsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_dbsrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_dpk_) :: key - real(psb_dpk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_dbsrch - -function psb_dssrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_dssrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_dpk_) :: key - real(psb_dpk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_dssrch - subroutine psb_dmsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_dmsort use psb_error_mod diff --git a/base/serial/sort/psb_d_qsort_impl.f90 b/base/serial/sort/psb_d_qsort_impl.f90 index 13328188..8e4b1d21 100644 --- a/base/serial/sort/psb_d_qsort_impl.f90 +++ b/base/serial/sort/psb_d_qsort_impl.f90 @@ -40,6 +40,125 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_dbsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_dbsrch + implicit none + integer(psb_ipk_) :: ipos, n + real(psb_dpk_) :: key + real(psb_dpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) exit + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) exit + m = m + 1 + end do + else + do while (m>1) + if (v(m-1)>=key) then + m=m-1 + else + exit + end if + end do + end if + ipos = max(m,1) + + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_dbsrch + +function psb_dssrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_dssrch + implicit none + integer(psb_ipk_) :: ipos, n + real(psb_dpk_) :: key + real(psb_dpk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_dssrch + subroutine psb_dqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_dqsort use psb_error_mod diff --git a/base/serial/sort/psb_e_msort_impl.f90 b/base/serial/sort/psb_e_msort_impl.f90 index d8cd6404..b97d448a 100644 --- a/base/serial/sort/psb_e_msort_impl.f90 +++ b/base/serial/sort/psb_e_msort_impl.f90 @@ -131,64 +131,6 @@ subroutine psb_emsort_u(x,nout,dir) return end subroutine psb_emsort_u - -function psb_ebsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_ebsrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_epk_) :: key - integer(psb_epk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_ebsrch - -function psb_essrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_essrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_epk_) :: key - integer(psb_epk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_essrch - subroutine psb_emsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_emsort use psb_error_mod diff --git a/base/serial/sort/psb_e_qsort_impl.f90 b/base/serial/sort/psb_e_qsort_impl.f90 index 9b95c78e..c70f8051 100644 --- a/base/serial/sort/psb_e_qsort_impl.f90 +++ b/base/serial/sort/psb_e_qsort_impl.f90 @@ -40,6 +40,125 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_ebsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_ebsrch + implicit none + integer(psb_ipk_) :: ipos, n + integer(psb_epk_) :: key + integer(psb_epk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) exit + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) exit + m = m + 1 + end do + else + do while (m>1) + if (v(m-1)>=key) then + m=m-1 + else + exit + end if + end do + end if + ipos = max(m,1) + + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_ebsrch + +function psb_essrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_essrch + implicit none + integer(psb_ipk_) :: ipos, n + integer(psb_epk_) :: key + integer(psb_epk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_essrch + subroutine psb_eqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_eqsort use psb_error_mod diff --git a/base/serial/sort/psb_m_msort_impl.f90 b/base/serial/sort/psb_m_msort_impl.f90 index cd99a3c5..437d1069 100644 --- a/base/serial/sort/psb_m_msort_impl.f90 +++ b/base/serial/sort/psb_m_msort_impl.f90 @@ -131,64 +131,6 @@ subroutine psb_mmsort_u(x,nout,dir) return end subroutine psb_mmsort_u - -function psb_mbsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_mbsrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_mpk_) :: key - integer(psb_mpk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_mbsrch - -function psb_mssrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_mssrch - implicit none - integer(psb_ipk_) :: ipos, n - integer(psb_mpk_) :: key - integer(psb_mpk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_mssrch - subroutine psb_mmsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_mmsort use psb_error_mod diff --git a/base/serial/sort/psb_m_qsort_impl.f90 b/base/serial/sort/psb_m_qsort_impl.f90 index ac8241f5..6b70c3a0 100644 --- a/base/serial/sort/psb_m_qsort_impl.f90 +++ b/base/serial/sort/psb_m_qsort_impl.f90 @@ -40,6 +40,125 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_mbsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_mbsrch + implicit none + integer(psb_ipk_) :: ipos, n + integer(psb_mpk_) :: key + integer(psb_mpk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) exit + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) exit + m = m + 1 + end do + else + do while (m>1) + if (v(m-1)>=key) then + m=m-1 + else + exit + end if + end do + end if + ipos = max(m,1) + + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_mbsrch + +function psb_mssrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_mssrch + implicit none + integer(psb_ipk_) :: ipos, n + integer(psb_mpk_) :: key + integer(psb_mpk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_mssrch + subroutine psb_mqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_mqsort use psb_error_mod diff --git a/base/serial/sort/psb_s_msort_impl.f90 b/base/serial/sort/psb_s_msort_impl.f90 index dfd7508c..e3382f27 100644 --- a/base/serial/sort/psb_s_msort_impl.f90 +++ b/base/serial/sort/psb_s_msort_impl.f90 @@ -76,64 +76,6 @@ subroutine psb_smsort_u(x,nout,dir) return end subroutine psb_smsort_u - -function psb_sbsrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_sbsrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_spk_) :: key - real(psb_spk_) :: v(:) - - integer(psb_ipk_) :: lb, ub, m, i - - ipos = -1 - if (n<5) then - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - return - end if - - lb = 1 - ub = n - - do while (lb.le.ub) - m = (lb+ub)/2 - if (key.eq.v(m)) then - ipos = m - lb = ub + 1 - else if (key < v(m)) then - ub = m-1 - else - lb = m + 1 - end if - enddo - return -end function psb_sbsrch - -function psb_sssrch(key,n,v) result(ipos) - use psb_sort_mod, psb_protect_name => psb_sssrch - implicit none - integer(psb_ipk_) :: ipos, n - real(psb_spk_) :: key - real(psb_spk_) :: v(:) - - integer(psb_ipk_) :: i - - ipos = -1 - do i=1,n - if (key.eq.v(i)) then - ipos = i - return - end if - enddo - - return -end function psb_sssrch - subroutine psb_smsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_smsort use psb_error_mod diff --git a/base/serial/sort/psb_s_qsort_impl.f90 b/base/serial/sort/psb_s_qsort_impl.f90 index d6e0e66e..cae32546 100644 --- a/base/serial/sort/psb_s_qsort_impl.f90 +++ b/base/serial/sort/psb_s_qsort_impl.f90 @@ -40,6 +40,125 @@ ! Data Structures and Algorithms ! Addison-Wesley ! +function psb_sbsrch(key,n,v,dir,find) result(ipos) + use psb_sort_mod, psb_protect_name => psb_sbsrch + implicit none + integer(psb_ipk_) :: ipos, n + real(psb_spk_) :: key + real(psb_spk_) :: v(:) + integer(psb_ipk_), optional :: dir, find + + integer(psb_ipk_) :: lb, ub, m, i, k, dir_, find_ + + if (present(dir)) then + dir_ = dir + else + dir_ = psb_sort_up_ + end if + if (present(find)) then + find_ = find + else + find_ = psb_find_any_ + end if + + ipos = -1 + if (dir_ == psb_sort_up_) then + if (n<=5) then + do m=1,n + if (key == v(m)) then + ipos = m + exit + end if + enddo + + else + + lb = 1 + ub = n + + do while (lb.le.ub) + m = (lb+ub)/2 + if (key.eq.v(m)) then + ipos = m + exit + else if (key < v(m)) then + ub = m-1 + else + lb = m + 1 + end if + enddo + end if + select case(find_) + case (psb_find_any_ ) + ! do nothing + case (psb_find_last_le_ ) + if ((m>n) .or. (m<1)) then + m = n + do while (m>=1) + if (v(m)<=key) exit + m = m - 1 + end do + else + do while (mn) .or. (m<1)) then + m = 1 + do while (m<=n) + if (v(m)>=key) exit + m = m + 1 + end do + else + do while (m>1) + if (v(m-1)>=key) then + m=m-1 + else + exit + end if + end do + end if + ipos = max(m,1) + + case default + write(0,*) 'Wrong FIND' + end select + + + else if (dir_ == psb_sort_down_) then + write(0,*) ' bsrch on sort down not implemented' + else + write(0,*) ' bsrch wrong DIR ',dir_,psb_sort_up_,psb_sort_down_ + end if + return +end function psb_sbsrch + +function psb_sssrch(key,n,v) result(ipos) + use psb_sort_mod, psb_protect_name => psb_sssrch + implicit none + integer(psb_ipk_) :: ipos, n + real(psb_spk_) :: key + real(psb_spk_) :: v(:) + + integer(psb_ipk_) :: i + + ipos = -1 + do i=1,n + if (key.eq.v(i)) then + ipos = i + return + end if + enddo + + return +end function psb_sssrch + subroutine psb_sqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_sqsort use psb_error_mod diff --git a/base/serial/sort/psb_z_qsort_impl.f90 b/base/serial/sort/psb_z_qsort_impl.f90 index 7b0af1c5..a1cdb193 100644 --- a/base/serial/sort/psb_z_qsort_impl.f90 +++ b/base/serial/sort/psb_z_qsort_impl.f90 @@ -40,6 +40,7 @@ ! Data Structures and Algorithms ! Addison-Wesley ! + subroutine psb_zqsort(x,ix,dir,flag) use psb_sort_mod, psb_protect_name => psb_zqsort use psb_error_mod diff --git a/base/tools/psb_cspins.F90 b/base/tools/psb_cspins.F90 index 27cfbd8e..e5f2731d 100644 --- a/base/tools/psb_cspins.F90 +++ b/base/tools/psb_cspins.F90 @@ -51,6 +51,9 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_cspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -131,11 +134,117 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) @@ -143,7 +252,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then @@ -151,11 +260,10 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name,a_err='a%csput') goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -177,6 +285,8 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name) goto 9999 end if +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then @@ -189,9 +299,18 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -201,7 +320,7 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -220,8 +339,12 @@ subroutine psb_cspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/base/tools/psb_dspins.F90 b/base/tools/psb_dspins.F90 index 2a70ab83..cdeaa931 100644 --- a/base/tools/psb_dspins.F90 +++ b/base/tools/psb_dspins.F90 @@ -51,6 +51,9 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_dspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -131,11 +134,117 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) @@ -143,7 +252,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then @@ -151,11 +260,10 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name,a_err='a%csput') goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -177,6 +285,8 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name) goto 9999 end if +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then @@ -189,9 +299,18 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -201,7 +320,7 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -220,8 +339,12 @@ subroutine psb_dspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/base/tools/psb_sspins.F90 b/base/tools/psb_sspins.F90 index aee7a900..39e4ad79 100644 --- a/base/tools/psb_sspins.F90 +++ b/base/tools/psb_sspins.F90 @@ -51,6 +51,9 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_sspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -131,11 +134,117 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) @@ -143,7 +252,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then @@ -151,11 +260,10 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name,a_err='a%csput') goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -177,6 +285,8 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name) goto 9999 end if +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then @@ -189,9 +299,18 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -201,7 +320,7 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -220,8 +339,12 @@ subroutine psb_sspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/base/tools/psb_zspins.F90 b/base/tools/psb_zspins.F90 index abe64251..0c0ff91f 100644 --- a/base/tools/psb_zspins.F90 +++ b/base/tools/psb_zspins.F90 @@ -51,6 +51,9 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) use psb_base_mod, psb_protect_name => psb_zspins use psi_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none !....parameters... @@ -131,11 +134,117 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + block + logical :: is_in_parallel + is_in_parallel = omp_in_parallel() + if (is_in_parallel) then + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !$omp critical(spins) + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !$omp end critical(spins) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9998 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9998 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + end if +9998 continue + !write(0,*) me,' after csput',psb_errstatus_fatal() + !$omp end parallel + else + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) + !write(0,*) me,' Before g2l_ins ',psb_errstatus_fatal() + if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& + & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_cdins',i_err=(/info/)) + goto 9999 + end if + nrow = desc_a%get_local_rows() + ncol = desc_a%get_local_cols() + !write(0,*) me,' Before csput',psb_errstatus_fatal() + if (a%is_bld()) then + call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + call psb_errpush(info,name,a_err='a%csput') + goto 9999 + end if + + if (a%is_remote_build()) then + nnl = count(ila(1:nz)<0) + if (nnl > 0) then + allocate(lila(nnl),ljla(nnl),lval(nnl)) + k = 0 + do i=1,nz + if (ila(i)<0) then + k=k+1 + lila(k) = ia(i) + ljla(k) = ja(i) + lval(k) = val(i) + end if + end do + if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl + call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& + & 1_psb_lpk_,desc_a%get_global_rows(),info) + end if + end if + + else + info = psb_err_invalid_a_and_cd_state_ + call psb_errpush(info,name) + goto 9999 + end if + end if + end block +#else + + !write(0,*) me,' Before g2l ',psb_errstatus_fatal() + call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) - call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l_ins(ja(1:nz),jla(1:nz),info,& & mask=(ila(1:nz)>0)) + !write(0,*) me,' after g2l_ins ',psb_errstatus_fatal(),info if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_ai_,name,& & a_err='psb_cdins',i_err=(/info/)) @@ -143,7 +252,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) end if nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() - + !write(0,*) me,' Before csput',psb_errstatus_fatal() if (a%is_bld()) then call a%csput(nz,ila,jla,val,ione,nrow,ione,ncol,info) if (info /= psb_success_) then @@ -151,11 +260,10 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name,a_err='a%csput') goto 9999 end if - + if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) if (nnl > 0) then - !write(0,*) 'Check on insert ',nnl allocate(lila(nnl),ljla(nnl),lval(nnl)) k = 0 do i=1,nz @@ -177,6 +285,8 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) call psb_errpush(info,name) goto 9999 end if +#endif + if (info /= 0) goto 9999 endif else if (desc_a%is_asb()) then @@ -189,9 +299,18 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) & a_err='allocate',i_err=(/info/)) goto 9999 end if +#if defined(OPENMP) + !$omp parallel private(ila,jla,nrow,ncol,nnl,k) +#endif if (local_) then +#if defined(OPENMP) + !$omp workshare +#endif ila(1:nz) = ia(1:nz) jla(1:nz) = ja(1:nz) +#if defined(OPENMP) + !$omp end workshare +#endif else call desc_a%indxmap%g2l(ia(1:nz),ila(1:nz),info,owned=.true.) if (info == 0) call desc_a%indxmap%g2l(ja(1:nz),jla(1:nz),info,& @@ -201,7 +320,7 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='a%csput') - goto 9999 + !goto 9999 end if if (a%is_remote_build()) then nnl = count(ila(1:nz)<0) @@ -220,8 +339,12 @@ subroutine psb_zspins(nz,ia,ja,val,a,desc_a,info,rebuild,local) if (k /= nnl) write(0,*) name,' Wrong conversion?',k,nnl call a%rmta%csput(nnl,lila,ljla,lval,1_psb_lpk_,desc_a%get_global_rows(),& & 1_psb_lpk_,desc_a%get_global_rows(),info) - end if + end if end if +#if defined(OPENMP) + !$omp end parallel +#endif + else info = psb_err_invalid_cd_state_ call psb_errpush(info,name) diff --git a/test/omp/Makefile b/test/omp/Makefile new file mode 100644 index 00000000..c35431c5 --- /dev/null +++ b/test/omp/Makefile @@ -0,0 +1,56 @@ +INSTALLDIR=../.. +INCDIR=$(INSTALLDIR)/include +MODDIR=$(INSTALLDIR)/modules/ +include $(INCDIR)/Make.inc.psblas +# +# Libraries used +LIBDIR=$(INSTALLDIR)/lib +PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base +LDLIBS=$(PSBLDLIBS) +# +# Compilers and such +# +CCOPT= -g +FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG). + + +EXEDIR=./runs + +all: runsd psb_tomp #psb_d_pde3d psb_s_pde3d psb_d_pde2d psb_s_pde2d + +runsd: + (if test ! -d runs ; then mkdir runs; fi) + +psb_tomp: psb_tomp.o + $(FLINK) psb_tomp.o -o psb_tomp $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_tomp $(EXEDIR) + +psb_d_pde3d: psb_d_pde3d.o + $(FLINK) psb_d_pde3d.o -o psb_d_pde3d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_d_pde3d $(EXEDIR) + + +psb_s_pde3d: psb_s_pde3d.o + $(FLINK) psb_s_pde3d.o -o psb_s_pde3d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_s_pde3d $(EXEDIR) + +psb_d_pde2d: psb_d_pde2d.o + $(FLINK) psb_d_pde2d.o -o psb_d_pde2d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_d_pde2d $(EXEDIR) + + +psb_s_pde2d: psb_s_pde2d.o + $(FLINK) psb_s_pde2d.o -o psb_s_pde2d $(PSBLAS_LIB) $(LDLIBS) + /bin/mv psb_s_pde2d $(EXEDIR) + + +clean: + /bin/rm -f psb_tomp.o psb_d_pde3d.o psb_s_pde3d.o psb_d_pde2d.o psb_s_pde2d.o *$(.mod) \ + $(EXEDIR)/psb_d_pde3d $(EXEDIR)/psb_s_pde3d $(EXEDIR)/psb_d_pde2d $(EXEDIR)/psb_s_pde2d +verycleanlib: + (cd ../..; make veryclean) +lib: + (cd ../../; make library) + + + diff --git a/test/omp/psb_tomp.F90 b/test/omp/psb_tomp.F90 new file mode 100644 index 00000000..79097ca8 --- /dev/null +++ b/test/omp/psb_tomp.F90 @@ -0,0 +1,1091 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! File: psb_d_pde3d.f90 +! +! Program: psb_d_pde3d +! This sample program solves a linear system obtained by discretizing a +! PDE with Dirichlet BCs. +! +! +! The PDE is a general second order equation in 3d +! +! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) +! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f +! dxdx dydy dzdz dx dy dz +! +! with Dirichlet boundary conditions +! u = g +! +! on the unit cube 0<=x,y,z<=1. +! +! +! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. +! +! There are three choices available for data distribution: +! 1. A simple BLOCK distribution +! 2. A ditribution based on arbitrary assignment of indices to processes, +! typically from a graph partitioner +! 3. A 3D distribution in which the unit cube is partitioned +! into subcubes, each one assigned to a process. +! +! +module psb_d_pde3d_mod + + + use psb_base_mod, only : psb_dpk_, psb_ipk_, psb_lpk_, psb_desc_type,& + & psb_dspmat_type, psb_d_vect_type, dzero,& + & psb_d_base_sparse_mat, psb_d_base_vect_type, & + & psb_i_base_vect_type, psb_l_base_vect_type + + interface + function d_func_3d(x,y,z) result(val) + import :: psb_dpk_ + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + end function d_func_3d + end interface + + interface psb_gen_pde3d + module procedure psb_d_gen_pde3d + end interface psb_gen_pde3d + +contains + + function d_null_func_3d(x,y,z) result(val) + + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + + val = dzero + + end function d_null_func_3d + ! + ! functions parametrizing the differential equation + ! + + ! + ! Note: b1, b2 and b3 are the coefficients of the first + ! derivative of the unknown function. The default + ! we apply here is to have them zero, so that the resulting + ! matrix is symmetric/hermitian and suitable for + ! testing with CG and FCG. + ! When testing methods for non-hermitian matrices you can + ! change the B1/B2/B3 functions to e.g. done/sqrt((3*done)) + ! + function b1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b1 + real(psb_dpk_), intent(in) :: x,y,z + b1=dzero + end function b1 + function b2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b2 + real(psb_dpk_), intent(in) :: x,y,z + b2=dzero + end function b2 + function b3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b3 + real(psb_dpk_), intent(in) :: x,y,z + b3=dzero + end function b3 + function c(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: c + real(psb_dpk_), intent(in) :: x,y,z + c=dzero + end function c + function a1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a1 + real(psb_dpk_), intent(in) :: x,y,z + a1=done/80 + end function a1 + function a2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a2 + real(psb_dpk_), intent(in) :: x,y,z + a2=done/80 + end function a2 + function a3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a3 + real(psb_dpk_), intent(in) :: x,y,z + a3=done/80 + end function a3 + function g(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: g + real(psb_dpk_), intent(in) :: x,y,z + g = dzero + if (x == done) then + g = done + else if (x == dzero) then + g = exp(y**2-z**2) + end if + end function g + + + ! + ! subroutine to allocate and fill in the coefficient matrix and + ! the rhs. + ! + subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info,& + & f,amold,vmold,imold,partition,nrl,iv) + use psb_base_mod + use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif + ! + ! Discretizes the partial differential equation + ! + ! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) + ! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f + ! dxdx dydy dzdz dx dy dz + ! + ! with Dirichlet boundary conditions + ! u = g + ! + ! on the unit cube 0<=x,y,z<=1. + ! + ! + ! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. + ! + implicit none + integer(psb_ipk_) :: idim + type(psb_dspmat_type) :: a + type(psb_d_vect_type) :: xv,bv + type(psb_desc_type) :: desc_a + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: info + character(len=*) :: afmt + procedure(d_func_3d), optional :: f + class(psb_d_base_sparse_mat), optional :: amold + class(psb_d_base_vect_type), optional :: vmold + class(psb_i_base_vect_type), optional :: imold + integer(psb_ipk_), optional :: partition, nrl,iv(:) + + ! Local variables. + + integer(psb_ipk_), parameter :: nb=20 + type(psb_d_csc_sparse_mat) :: acsc + type(psb_d_coo_sparse_mat) :: acoo + type(psb_d_csr_sparse_mat) :: acsr + real(psb_dpk_) :: zt(nb),x,y,z + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_, mysz + integer(psb_lpk_) :: m,n,glob_row,nt + integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner + ! For 3D partition + ! Note: integer control variables going directly into an MPI call + ! must be 4 bytes, i.e. psb_mpk_ + integer(psb_mpk_) :: npdims(3), npp, minfo + integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz + integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:) + ! Process grid + integer(psb_ipk_) :: np, iam + integer(psb_ipk_) :: icoeff + integer(psb_lpk_), allocatable :: myidx(:) + ! deltah dimension of each grid cell + ! deltat discretization time + real(psb_dpk_) :: deltah, sqdeltah, deltah2 + real(psb_dpk_), parameter :: rhs=dzero,one=done,zero=dzero + real(psb_dpk_) :: t0, t1, t2, t3, tasb, talc, ttot, tgen, tcdasb + integer(psb_ipk_) :: err_act + procedure(d_func_3d), pointer :: f_ + character(len=20) :: name, ch_err,tmpfmt + + info = psb_success_ + name = 'create_matrix' + call psb_erractionsave(err_act) + + call psb_info(ctxt, iam, np) + + + if (present(f)) then + f_ => f + else + f_ => d_null_func_3d + end if + + deltah = done/(idim+1) + sqdeltah = deltah*deltah + deltah2 = (2*done)* deltah + + if (present(partition)) then + if ((1<= partition).and.(partition <= 3)) then + partition_ = partition + else + write(*,*) 'Invalid partition choice ',partition,' defaulting to 3' + partition_ = 3 + end if + else + partition_ = 3 + end if + + ! initialize array descriptor and sparse matrix storage. provide an + ! estimate of the number of non zeroes + + m = (1_psb_lpk_*idim)*idim*idim + n = m + nnz = ((n*7)/(np)) + if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n + t0 = psb_wtime() + select case(partition_) + case(1) + ! A BLOCK partition + if (present(nrl)) then + nr = nrl + else + ! + ! Using a simple BLOCK distribution. + ! + nt = (m+np-1)/np + nr = max(0,min(nt,m-(iam*nt))) + end if + + nt = nr + call psb_sum(ctxt,nt) + if (nt /= m) then + write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! First example of use of CDALL: specify for each process a number of + ! contiguous rows + ! + call psb_cdall(ctxt,desc_a,info,nl=nr) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(2) + ! A partition defined by the user through IV + + if (present(iv)) then + if (size(iv) /= m) then + write(psb_err_unit,*) iam, 'Initialization error: wrong IV size',size(iv),m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + else + write(psb_err_unit,*) iam, 'Initialization error: IV not present' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! Second example of use of CDALL: specify for each row the + ! process that owns it + ! + call psb_cdall(ctxt,desc_a,info,vg=iv) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(3) + ! A 3-dimensional partition + + ! A nifty MPI function will split the process list + npdims = 0 +#if defined(SERIAL_MPI) + npdims = 1 +#else + call mpi_dims_create(np,3,npdims,info) +#endif + npx = npdims(1) + npy = npdims(2) + npz = npdims(3) + + allocate(bndx(0:npx),bndy(0:npy),bndz(0:npz)) + ! We can reuse idx2ijk for process indices as well. + call idx2ijk(iamx,iamy,iamz,iam,npx,npy,npz,base=0) + ! Now let's split the 3D cube in hexahedra + call dist1Didx(bndx,idim,npx) + mynx = bndx(iamx+1)-bndx(iamx) + call dist1Didx(bndy,idim,npy) + myny = bndy(iamy+1)-bndy(iamy) + call dist1Didx(bndz,idim,npz) + mynz = bndz(iamz+1)-bndz(iamz) + + ! How many indices do I own? + nlr = mynx*myny*mynz + allocate(myidx(nlr)) + ! Now, let's generate the list of indices I own + nr = 0 + do i=bndx(iamx),bndx(iamx+1)-1 + do j=bndy(iamy),bndy(iamy+1)-1 + do k=bndz(iamz),bndz(iamz+1)-1 + nr = nr + 1 + call ijk2idx(myidx(nr),i,j,k,idim,idim,idim) + end do + end do + end do + if (nr /= nlr) then + write(psb_err_unit,*) iam,iamx,iamy,iamz, 'Initialization error: NR vs NLR ',& + & nr,nlr,mynx,myny,mynz + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + end if + + ! + ! Third example of use of CDALL: specify for each process + ! the set of global indices it owns. + ! + call psb_cdall(ctxt,desc_a,info,vl=myidx) + + + ! + ! Specify process topology + ! + block + ! + ! Use adjcncy methods + ! + integer(psb_mpk_), allocatable :: neighbours(:) + integer(psb_mpk_) :: cnt + logical, parameter :: debug_adj=.true. + if (debug_adj.and.(np > 1)) then + cnt = 0 + allocate(neighbours(np)) + if (iamx < npx-1) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx+1,iamy,iamz,npx,npy,npz,base=0) + end if + if (iamy < npy-1) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy+1,iamz,npx,npy,npz,base=0) + end if + if (iamz < npz-1) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy,iamz+1,npx,npy,npz,base=0) + end if + if (iamx >0) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx-1,iamy,iamz,npx,npy,npz,base=0) + end if + if (iamy >0) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy-1,iamz,npx,npy,npz,base=0) + end if + if (iamz >0) then + cnt = cnt + 1 + call ijk2idx(neighbours(cnt),iamx,iamy,iamz-1,npx,npy,npz,base=0) + end if + call psb_realloc(cnt, neighbours,info) + call desc_a%set_p_adjcncy(neighbours) + !write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy() + end if + end block + + case default + write(psb_err_unit,*) iam, 'Initialization error: should not get here' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end select + + + if (info == psb_success_) call psb_spall(a,desc_a,info,nnz=nnz, & + & bldmode=psb_matbld_remote_,dupl=psb_dupl_add_) + ! define rhs from boundary conditions; also build initial guess + if (info == psb_success_) call psb_geall(xv,desc_a,info) + if (info == psb_success_) call psb_geall(bv,desc_a,info,& + & bldmode=psb_matbld_remote_,dupl=psb_dupl_add_) + + call psb_barrier(ctxt) + talc = psb_wtime()-t0 + + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='allocation rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + call psb_barrier(ctxt) + t1 = psb_wtime() + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + + ! loop over rows belonging to current process in a block + ! distribution. + + !$omp do schedule(dynamic,4) + ! + do ii=1, nlr, nb + if (info /= 0) cycle + ib = min(nb,nlr-ii+1) + !ib = min(nb,mysz-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle + end do + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel + + tgen = psb_wtime()-t1 + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='insert rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + + call psb_barrier(ctxt) + t1 = psb_wtime() + call psb_cdasb(desc_a,info,mold=imold) + tcdasb = psb_wtime()-t1 + + call psb_barrier(ctxt) + t1 = psb_wtime() + if (info == psb_success_) then + if (present(amold)) then + call psb_spasb(a,desc_a,info,mold=amold) + else + call psb_spasb(a,desc_a,info,afmt=afmt) + end if + end if + call psb_barrier(ctxt) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + if (info == psb_success_) call psb_geasb(xv,desc_a,info,mold=vmold) + if (info == psb_success_) call psb_geasb(bv,desc_a,info,mold=vmold) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + tasb = psb_wtime()-t1 + call psb_barrier(ctxt) + ttot = psb_wtime() - t0 + + call psb_amx(ctxt,talc) + call psb_amx(ctxt,tgen) + call psb_amx(ctxt,tasb) + call psb_amx(ctxt,ttot) + if(iam == psb_root_) then + tmpfmt = a%get_fmt() + write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")')& + & tmpfmt + write(psb_out_unit,'("-allocation time : ",es12.5)') talc + write(psb_out_unit,'("-coeff. gen. time : ",es12.5)') tgen + write(psb_out_unit,'("-desc asbly time : ",es12.5)') tcdasb + write(psb_out_unit,'("- mat asbly time : ",es12.5)') tasb + write(psb_out_unit,'("-total time : ",es12.5)') ttot + + end if + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(ctxt,err_act) + + return + end subroutine psb_d_gen_pde3d + function outside(i,j,k,bndx,bndy,bndz,iamx,iamy,iamz) result(res) + logical :: res + integer(psb_ipk_), intent(in) :: i,j,k,iamx,iamy,iamz + integer(psb_ipk_), intent(in) :: bndx(0:),bndy(0:),bndz(0:) + + res = (i=bndx(iamx+1)) & + & .or.(j=bndy(iamy+1)) & + & .or.(k=bndz(iamz+1)) + end function outside +end module psb_d_pde3d_mod + +program psb_d_pde3d + use psb_base_mod + use psb_prec_mod + use psb_krylov_mod + use psb_util_mod + use psb_d_pde3d_mod +#if defined(OPENMP) + use omp_lib +#endif + implicit none + + ! input parameters + character(len=20) :: kmethd, ptype + character(len=5) :: afmt + integer(psb_ipk_) :: idim + integer(psb_epk_) :: system_size + + ! miscellaneous + real(psb_dpk_), parameter :: one = done + real(psb_dpk_) :: t1, t2, tprec + + ! sparse matrix and preconditioner + type(psb_dspmat_type) :: a + type(psb_dprec_type) :: prec + ! descriptor + type(psb_desc_type) :: desc_a + ! dense vectors + type(psb_d_vect_type) :: xxv,bv + ! parallel environment + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: iam, np, nth + + ! solver parameters + integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart + integer(psb_epk_) :: amatsize, precsize, descsize, d2size + real(psb_dpk_) :: err, eps + + ! Parameters for solvers in Block-Jacobi preconditioner + type ainvparms + character(len=12) :: alg, orth_alg, ilu_alg, ilut_scale + integer(psb_ipk_) :: fill, inv_fill + real(psb_dpk_) :: thresh, inv_thresh + end type ainvparms + type(ainvparms) :: parms + + ! other variables + integer(psb_ipk_) :: info, i + character(len=20) :: name,ch_err + character(len=40) :: fname + + info=psb_success_ + + + call psb_init(ctxt) + call psb_info(ctxt,iam,np) +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + + if (iam < 0) then + ! This should not happen, but just in case + call psb_exit(ctxt) + stop + endif + if(psb_errstatus_fatal()) goto 9999 + name='pde3d90' + call psb_set_errverbosity(itwo) + call psb_cd_set_large_threshold(125000_psb_ipk_) + ! + ! Hello world + ! + if (iam == psb_root_) then + write(*,*) 'Welcome to PSBLAS version: ',psb_version_string_ + write(*,*) 'This is the ',trim(name),' sample program' + end if +#if 0 + block + integer(psb_ipk_), parameter :: ntv=10 + integer(psb_ipk_) :: itv(ntv+1),i + itv(:) = 0 + do i=1,ntv + itv(i) = 2 + mod(i,2) + end do + write(0,*) 'ITV before : ',itv(:) + call psi_exscan(ntv,itv,info) + write(0,*) 'ITV after : ',itv(:) + itv(:) = 0 + do i=1,ntv + itv(i) = 2 + mod(i,2) + end do + write(0,*) 'ITV before 1: ',itv(:) + call psi_exscan(ntv,itv,info,shift=ione) + write(0,*) 'ITV after 1: ',itv(:) + ! call a%print('a.mtx',head='Test') + end block +!!$ +!!$ call psb_exit(ctxt) +!!$ stop +#endif + ! + ! get parameters + ! + call get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ipart,parms) + ! + ! allocate and fill in the coefficient matrix, rhs and initial guess + ! + call psb_barrier(ctxt) + t1 = psb_wtime() + call psb_gen_pde3d(ctxt,idim,a,bv,xxv,desc_a,afmt,info,partition=ipart) + call psb_barrier(ctxt) + t2 = psb_wtime() - t1 + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='psb_gen_pde3d' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + if (iam == psb_root_) write(psb_out_unit,'("Overall matrix creation time : ",es12.5)')t2 + if (iam == psb_root_) write(psb_out_unit,'(" ")') + call a%print('a.mtx',head='Test') + ! + ! prepare the preconditioner. + ! + if(iam == psb_root_) write(psb_out_unit,'("Setting preconditioner to : ",a)')ptype + call prec%init(ctxt,ptype,info) + ! + ! Set the options for the BJAC preconditioner + ! + if (psb_toupper(ptype) == "BJAC") then + call prec%set('sub_solve', parms%alg, info) + select case (psb_toupper(parms%alg)) + case ("ILU") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('ilu_alg', parms%ilu_alg, info) + case ("ILUT") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('sub_iluthrs', parms%thresh, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + case ("AINV") + call prec%set('inv_thresh', parms%inv_thresh, info) + call prec%set('inv_fillin', parms%inv_fill, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + call prec%set('ainv_alg', parms%orth_alg, info) + case ("INVK") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('inv_fillin', parms%inv_fill, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + case ("INVT") + call prec%set('sub_fillin', parms%fill, info) + call prec%set('inv_fillin', parms%inv_fill, info) + call prec%set('sub_iluthrs', parms%thresh, info) + call prec%set('inv_thresh', parms%inv_thresh, info) + call prec%set('ilut_scale', parms%ilut_scale, info) + case default + ! Do nothing, use default setting in the init routine + end select + else + ! nothing to set for NONE or DIAG preconditioner + end if + + call psb_barrier(ctxt) + t1 = psb_wtime() + call prec%build(a,desc_a,info) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='psb_precbld' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + tprec = psb_wtime()-t1 + + call psb_amx(ctxt,tprec) + + if (iam == psb_root_) write(psb_out_unit,'("Preconditioner time : ",es12.5)')tprec + if (iam == psb_root_) write(psb_out_unit,'(" ")') + call prec%descr(info) + ! + ! iterative method parameters + ! + if(iam == psb_root_) write(psb_out_unit,'("Calling iterative method ",a)')kmethd + call psb_barrier(ctxt) + t1 = psb_wtime() + eps = 1.d-6 + call psb_krylov(kmethd,a,prec,bv,xxv,eps,desc_a,info,& + & itmax=itmax,iter=iter,err=err,itrace=itrace,istop=istopc,irst=irst) + + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='solver routine' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + call psb_barrier(ctxt) + t2 = psb_wtime() - t1 + call psb_amx(ctxt,t2) + amatsize = a%sizeof() + descsize = desc_a%sizeof() + precsize = prec%sizeof() + system_size = desc_a%get_global_rows() + call psb_sum(ctxt,amatsize) + call psb_sum(ctxt,descsize) + call psb_sum(ctxt,precsize) + + if (iam == psb_root_) then + write(psb_out_unit,'(" ")') + write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Linear system size : ",i12)') system_size + write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 + write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter + write(psb_out_unit,'("Number of iterations : ",i12)')iter + write(psb_out_unit,'("Convergence indicator on exit : ",es12.5)')err + write(psb_out_unit,'("Info on exit : ",i12)')info + write(psb_out_unit,'("Total memory occupation for A: ",i12)')amatsize + write(psb_out_unit,'("Total memory occupation for PREC: ",i12)')precsize + write(psb_out_unit,'("Total memory occupation for DESC_A: ",i12)')descsize + write(psb_out_unit,'("Storage format for A: ",a)') a%get_fmt() + write(psb_out_unit,'("Storage format for DESC_A: ",a)') desc_a%get_fmt() + end if + + ! + ! cleanup storage and exit + ! + call psb_gefree(bv,desc_a,info) + call psb_gefree(xxv,desc_a,info) + call psb_spfree(a,desc_a,info) + call prec%free(info) + call psb_cdfree(desc_a,info) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='free routine' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + call psb_exit(ctxt) + stop + +9999 call psb_error(ctxt) + + stop + +contains + ! + ! get iteration parameters from standard input + ! + subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,& + & itmax,itrace,irst,ipart,parms) + type(psb_ctxt_type) :: ctxt + character(len=*) :: kmethd, ptype, afmt + integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst,ipart + integer(psb_ipk_) :: np, iam + integer(psb_ipk_) :: ip, inp_unit + character(len=1024) :: filename + type(ainvparms) :: parms + + call psb_info(ctxt, iam, np) + + if (iam == 0) then + if (command_argument_count()>0) then + call get_command_argument(1,filename) + inp_unit = 30 + open(inp_unit,file=filename,action='read',iostat=info) + if (info /= 0) then + write(psb_err_unit,*) 'Could not open file ',filename,' for input' + call psb_abort(ctxt) + stop + else + write(psb_err_unit,*) 'Opened file ',trim(filename),' for input' + end if + else + inp_unit=psb_inp_unit + end if + read(inp_unit,*) ip + if (ip >= 3) then + read(inp_unit,*) kmethd + read(inp_unit,*) ptype + read(inp_unit,*) afmt + + read(inp_unit,*) idim + if (ip >= 4) then + read(inp_unit,*) ipart + else + ipart = 3 + endif + if (ip >= 5) then + read(inp_unit,*) istopc + else + istopc=1 + endif + if (ip >= 6) then + read(inp_unit,*) itmax + else + itmax=500 + endif + if (ip >= 7) then + read(inp_unit,*) itrace + else + itrace=-1 + endif + if (ip >= 8) then + read(inp_unit,*) irst + else + irst=1 + endif + if (ip >= 9) then + read(inp_unit,*) parms%alg + read(inp_unit,*) parms%ilu_alg + read(inp_unit,*) parms%ilut_scale + read(inp_unit,*) parms%fill + read(inp_unit,*) parms%inv_fill + read(inp_unit,*) parms%thresh + read(inp_unit,*) parms%inv_thresh + read(inp_unit,*) parms%orth_alg + else + parms%alg = 'ILU' ! Block Solver ILU,ILUT,INVK,AINVT,AORTH + parms%ilu_alg = 'NONE' ! If ILU : MILU or NONE othewise ignored + parms%ilut_scale = 'NONE' ! If ILUT: NONE, MAXVAL, DIAG, ARWSUM, ACLSUM, ARCSUM + parms%fill = 0 ! Level of fill for forward factorization + parms%inv_fill = 1 ! Level of fill for inverse factorization (only INVK) + parms%thresh = 1E-1_psb_dpk_ ! Threshold for forward factorization + parms%inv_thresh = 1E-1_psb_dpk_ ! Threshold for inverse factorization + parms%orth_alg = 'LLK' ! What orthogonalization algorithm? + endif + + write(psb_out_unit,'("Solving matrix : ell1")') + write(psb_out_unit,& + & '("Grid dimensions : ",i4," x ",i4," x ",i4)') & + & idim,idim,idim + write(psb_out_unit,'("Number of processors : ",i0)')np + select case(ipart) + case(1) + write(psb_out_unit,'("Data distribution : BLOCK")') + case(3) + write(psb_out_unit,'("Data distribution : 3D")') + case default + ipart = 3 + write(psb_out_unit,'("Unknown data distrbution, defaulting to 3D")') + end select + write(psb_out_unit,'("Preconditioner : ",a)') ptype + if( psb_toupper(ptype) == "BJAC" ) then + write(psb_out_unit,'("Block subsolver : ",a)') parms%alg + select case (psb_toupper(parms%alg)) + case ('ILU') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("MILU : ",a)') parms%ilu_alg + case ('ILUT') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("Threshold : ",es12.5)') parms%thresh + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case ('INVK') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("Invese Fill in : ",i0)') parms%inv_fill + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case ('INVT') + write(psb_out_unit,'("Fill in : ",i0)') parms%fill + write(psb_out_unit,'("Threshold : ",es12.5)') parms%thresh + write(psb_out_unit,'("Invese Fill in : ",i0)') parms%inv_fill + write(psb_out_unit,'("Inverse Threshold : ",es12.5)') parms%inv_thresh + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case ('AINV','AORTH') + write(psb_out_unit,'("Inverse Threshold : ",es12.5)') parms%inv_thresh + write(psb_out_unit,'("Invese Fill in : ",i0)') parms%inv_fill + write(psb_out_unit,'("Orthogonalization : ",a)') parms%orth_alg + write(psb_out_unit,'("Scaling : ",a)') parms%ilut_scale + case default + write(psb_out_unit,'("Unknown diagonal solver")') + end select + end if + write(psb_out_unit,'("Iterative method : ",a)') kmethd + write(psb_out_unit,'(" ")') + else + ! wrong number of parameter, print an error message and exit + call pr_usage(izero) + call psb_abort(ctxt) + stop 1 + endif + if (inp_unit /= psb_inp_unit) then + close(inp_unit) + end if + + end if + ! broadcast parameters to all processors + call psb_bcast(ctxt,kmethd) + call psb_bcast(ctxt,afmt) + call psb_bcast(ctxt,ptype) + call psb_bcast(ctxt,idim) + call psb_bcast(ctxt,ipart) + call psb_bcast(ctxt,istopc) + call psb_bcast(ctxt,itmax) + call psb_bcast(ctxt,itrace) + call psb_bcast(ctxt,irst) + call psb_bcast(ctxt,parms%alg) + call psb_bcast(ctxt,parms%fill) + call psb_bcast(ctxt,parms%inv_fill) + call psb_bcast(ctxt,parms%thresh) + call psb_bcast(ctxt,parms%inv_thresh) + call psb_bcast(ctxt,parms%orth_alg) + call psb_bcast(ctxt,parms%ilut_scale) + + return + + end subroutine get_parms + ! + ! print an error message + ! + subroutine pr_usage(iout) + integer(psb_ipk_) :: iout + write(iout,*)'incorrect parameter(s) found' + write(iout,*)' usage: pde3d90 methd prec dim & + &[istop itmax itrace]' + write(iout,*)' where:' + write(iout,*)' methd: cgstab cgs rgmres bicgstabl' + write(iout,*)' prec : bjac diag none' + write(iout,*)' dim number of points along each axis' + write(iout,*)' the size of the resulting linear ' + write(iout,*)' system is dim**3' + write(iout,*)' ipart data partition 1 3 ' + write(iout,*)' istop stopping criterion 1, 2 ' + write(iout,*)' itmax maximum number of iterations [500] ' + write(iout,*)' itrace <=0 (no tracing, default) or ' + write(iout,*)' >= 1 do tracing every itrace' + write(iout,*)' iterations ' + end subroutine pr_usage + +end program psb_d_pde3d diff --git a/test/pargen/psb_d_pde2d.F90 b/test/pargen/psb_d_pde2d.F90 index 6da97828..11777b19 100644 --- a/test/pargen/psb_d_pde2d.F90 +++ b/test/pargen/psb_d_pde2d.F90 @@ -156,6 +156,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -192,7 +195,7 @@ contains type(psb_d_coo_sparse_mat) :: acoo type(psb_d_csr_sparse_mat) :: acsr real(psb_dpk_) :: zt(nb),x,y,z - integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_, mysz integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner ! For 2D partition @@ -204,8 +207,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -391,7 +393,6 @@ contains end if end block - case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -418,94 +419,114 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(done,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif - + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(done,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel + + tgen = psb_wtime()-t1 if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -514,8 +535,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info,mold=imold) @@ -579,6 +598,9 @@ program psb_d_pde2d use psb_krylov_mod use psb_util_mod use psb_d_pde2d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -600,7 +622,7 @@ program psb_d_pde2d type(psb_d_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -625,7 +647,16 @@ program psb_d_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -750,6 +781,8 @@ program psb_d_pde2d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter @@ -790,7 +823,8 @@ contains ! ! get iteration parameters from standard input ! - subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ipart,parms) + subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,& + & itmax,itrace,irst,ipart,parms) type(psb_ctxt_type) :: ctxt character(len=*) :: kmethd, ptype, afmt integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst,ipart diff --git a/test/pargen/psb_d_pde3d.F90 b/test/pargen/psb_d_pde3d.F90 index d4eeccf2..6e895c00 100644 --- a/test/pargen/psb_d_pde3d.F90 +++ b/test/pargen/psb_d_pde3d.F90 @@ -172,6 +172,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -220,8 +223,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_dpk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_dpk_) :: deltah, sqdeltah, deltah2 @@ -377,7 +379,6 @@ contains ! call psb_cdall(ctxt,desc_a,info,vl=myidx) - ! ! Specify process topology ! @@ -447,69 +448,79 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr, nb - ib = min(nb,nlr-ii+1) - !ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_dpk_), allocatable :: val(:) + real(psb_dpk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + !ib = min(nb,mysz-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif ! term depending on (x,y,z) val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & @@ -546,14 +557,22 @@ contains endif end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle + end do + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -569,113 +588,6 @@ contains call psb_cdasb(desc_a,info,mold=imold) tcdasb = psb_wtime()-t1 - if (.false.) then - ! - ! Add extra rows to test remote build. - ! - block - integer(psb_ipk_) :: ks, i - ks = desc_a%get_local_cols()-desc_a%get_local_rows() - if (ks > 0) ks = max(1,ks / 10) - mysz = nlr+ks - call psb_realloc(mysz,myidx,info) - do i=nlr+1, mysz - myidx(i) = i - end do - call desc_a%l2gv1(myidx(nlr+1:mysz),info) - !write(0,*) iam,' Check on extra nodes ',nlr,mysz,':',myidx(nlr+1:mysz) - do ii= nlr+1, mysz, nb - ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=dzero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - - end block - end if - call psb_barrier(ctxt) t1 = psb_wtime() if (info == psb_success_) then @@ -719,7 +631,7 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if - deallocate(val,irow,icol) + call psb_erractionrestore(err_act) return @@ -744,6 +656,9 @@ program psb_d_pde3d use psb_krylov_mod use psb_util_mod use psb_d_pde3d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -765,7 +680,7 @@ program psb_d_pde3d type(psb_d_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -790,7 +705,16 @@ program psb_d_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -914,6 +838,8 @@ program psb_d_pde3d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter diff --git a/test/pargen/psb_s_pde2d.F90 b/test/pargen/psb_s_pde2d.F90 index 664d5d08..f14d2cb4 100644 --- a/test/pargen/psb_s_pde2d.F90 +++ b/test/pargen/psb_s_pde2d.F90 @@ -156,6 +156,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -192,7 +195,7 @@ contains type(psb_s_coo_sparse_mat) :: acoo type(psb_s_csr_sparse_mat) :: acsr real(psb_spk_) :: zt(nb),x,y,z - integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_, mysz integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner ! For 2D partition @@ -204,8 +207,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -391,7 +393,6 @@ contains end if end block - case default write(psb_err_unit,*) iam, 'Initialization error: should not get here' info = -1 @@ -418,94 +419,114 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr,nb - ib = min(nb,nlr-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,glob_row,idim,idim) - ! x, y coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - - zt(k) = f_(x,y) - ! internal point: build discretization - ! - ! term depending on (x-1,y) - ! - val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1) - val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y) - val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) - call ijk2idx(icol(icoeff),ix,iy,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y+1) - val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y) - val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr,nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,glob_row,idim,idim) + ! x, y coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + + zt(k) = f_(x,y) + ! internal point: build discretization + ! + ! term depending on (x-1,y) + ! + val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1) + val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y) + val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) + call ijk2idx(icol(icoeff),ix,iy,idim,idim) irow(icoeff) = glob_row icoeff = icoeff+1 - endif - + ! term depending on (x,y+1) + val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 + if (iy == idim) then + zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y) + val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 + if (ix==idim) then + zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel + + tgen = psb_wtime()-t1 if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -514,8 +535,6 @@ contains goto 9999 end if - deallocate(val,irow,icol) - call psb_barrier(ctxt) t1 = psb_wtime() call psb_cdasb(desc_a,info,mold=imold) @@ -579,6 +598,9 @@ program psb_s_pde2d use psb_krylov_mod use psb_util_mod use psb_s_pde2d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -600,7 +622,7 @@ program psb_s_pde2d type(psb_s_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -625,7 +647,16 @@ program psb_s_pde2d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -750,6 +781,8 @@ program psb_s_pde2d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter @@ -790,7 +823,8 @@ contains ! ! get iteration parameters from standard input ! - subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,itmax,itrace,irst,ipart,parms) + subroutine get_parms(ctxt,kmethd,ptype,afmt,idim,istopc,& + & itmax,itrace,irst,ipart,parms) type(psb_ctxt_type) :: ctxt character(len=*) :: kmethd, ptype, afmt integer(psb_ipk_) :: idim, istopc,itmax,itrace,irst,ipart diff --git a/test/pargen/psb_s_pde3d.F90 b/test/pargen/psb_s_pde3d.F90 index 280a7c34..2938a4ff 100644 --- a/test/pargen/psb_s_pde3d.F90 +++ b/test/pargen/psb_s_pde3d.F90 @@ -172,6 +172,9 @@ contains & f,amold,vmold,imold,partition,nrl,iv) use psb_base_mod use psb_util_mod +#if defined(OPENMP) + use omp_lib +#endif ! ! Discretizes the partial differential equation ! @@ -220,8 +223,7 @@ contains ! Process grid integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: icoeff - integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) - real(psb_spk_), allocatable :: val(:) + integer(psb_lpk_), allocatable :: myidx(:) ! deltah dimension of each grid cell ! deltat discretization time real(psb_spk_) :: deltah, sqdeltah, deltah2 @@ -377,7 +379,6 @@ contains ! call psb_cdall(ctxt,desc_a,info,vl=myidx) - ! ! Specify process topology ! @@ -447,69 +448,79 @@ contains goto 9999 end if - ! we build an auxiliary matrix consisting of one row at a - ! time; just a small matrix. might be extended to generate - ! a bunch of rows per call. - ! - allocate(val(20*nb),irow(20*nb),& - &icol(20*nb),stat=info) - if (info /= psb_success_ ) then - info=psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - endif - - - ! loop over rows belonging to current process in a block - ! distribution. - call psb_barrier(ctxt) t1 = psb_wtime() - do ii=1, nlr, nb - ib = min(nb,nlr-ii+1) - !ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif + !$omp parallel shared(deltah,myidx,a,desc_a) + ! + block + integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth + integer(psb_lpk_) :: glob_row + integer(psb_lpk_), allocatable :: irow(:),icol(:) + real(psb_spk_), allocatable :: val(:) + real(psb_spk_) :: x,y,z, zt(nb) +#if defined(OPENMP) + nth = omp_get_num_threads() + ith = omp_get_thread_num() +#else + nth = 1 + ith = 0 +#endif + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + !goto 9999 + endif + + !$omp do schedule(dynamic) + ! + do ii=1, nlr, nb + if(info /= psb_success_) cycle + ib = min(nb,nlr-ii+1) + !ib = min(nb,mysz-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif ! term depending on (x,y,z) val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & @@ -546,14 +557,22 @@ contains endif end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do +#if defined(OPENMP) +!!$ write(0,*) omp_get_thread_num(),' Check insertion ',& +!!$ & irow(1:icoeff-1),':',icol(1:icoeff-1) +#endif + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) cycle + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) cycle + zt(:)=szero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) cycle + end do + !$omp end do + deallocate(val,irow,icol) + end block + !$omp end parallel tgen = psb_wtime()-t1 if(info /= psb_success_) then @@ -569,113 +588,6 @@ contains call psb_cdasb(desc_a,info,mold=imold) tcdasb = psb_wtime()-t1 - if (.false.) then - ! - ! Add extra rows to test remote build. - ! - block - integer(psb_ipk_) :: ks, i - ks = desc_a%get_local_cols()-desc_a%get_local_rows() - if (ks > 0) ks = max(1,ks / 10) - mysz = nlr+ks - call psb_realloc(mysz,myidx,info) - do i=nlr+1, mysz - myidx(i) = i - end do - call desc_a%l2gv1(myidx(nlr+1:mysz),info) - !write(0,*) iam,' Check on extra nodes ',nlr,mysz,':',myidx(nlr+1:mysz) - do ii= nlr+1, mysz, nb - ib = min(nb,mysz-ii+1) - icoeff = 1 - do k=1,ib - i=ii+k-1 - ! local matrix pointer - glob_row=myidx(i) - ! compute gridpoint coordinates - call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) - ! x, y, z coordinates - x = (ix-1)*deltah - y = (iy-1)*deltah - z = (iz-1)*deltah - zt(k) = f_(x,y,z) - ! internal point: build discretization - ! - ! term depending on (x-1,y,z) - ! - val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 - if (ix == 1) then - zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y-1,z) - val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 - if (iy == 1) then - zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y,z-1) - val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 - if (iz == 1) then - zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - ! term depending on (x,y,z) - val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & - & + c(x,y,z) - call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - ! term depending on (x,y,z+1) - val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 - if (iz == idim) then - zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x,y+1,z) - val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 - if (iy == idim) then - zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - ! term depending on (x+1,y,z) - val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 - if (ix==idim) then - zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) - else - call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) - irow(icoeff) = glob_row - icoeff = icoeff+1 - endif - - end do - call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) - if(info /= psb_success_) exit - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) - if(info /= psb_success_) exit - zt(:)=szero - call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) - if(info /= psb_success_) exit - end do - - end block - end if - call psb_barrier(ctxt) t1 = psb_wtime() if (info == psb_success_) then @@ -719,7 +631,7 @@ contains write(psb_out_unit,'("-total time : ",es12.5)') ttot end if - deallocate(val,irow,icol) + call psb_erractionrestore(err_act) return @@ -744,6 +656,9 @@ program psb_s_pde3d use psb_krylov_mod use psb_util_mod use psb_s_pde3d_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none ! input parameters @@ -765,7 +680,7 @@ program psb_s_pde3d type(psb_s_vect_type) :: xxv,bv ! parallel environment type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: iam, np + integer(psb_ipk_) :: iam, np, nth ! solver parameters integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, ipart @@ -790,7 +705,16 @@ program psb_s_pde3d call psb_init(ctxt) call psb_info(ctxt,iam,np) - +#if defined(OPENMP) + !$OMP parallel shared(nth) + !$OMP master + nth = omp_get_num_threads() + !$OMP end master + !$OMP end parallel +#else + nth = 1 +#endif + if (iam < 0) then ! This should not happen, but just in case call psb_exit(ctxt) @@ -914,6 +838,8 @@ program psb_s_pde3d if (iam == psb_root_) then write(psb_out_unit,'(" ")') write(psb_out_unit,'("Number of processes : ",i12)')np + write(psb_out_unit,'("Number of threads : ",i12)')nth + write(psb_out_unit,'("Total number of tasks : ",i12)')nth*np write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Time to solve system : ",es12.5)')t2 write(psb_out_unit,'("Time per iteration : ",es12.5)')t2/iter diff --git a/test/pargen/runs/ppde.inp b/test/pargen/runs/ppde.inp index e7e5dca2..40e3358d 100644 --- a/test/pargen/runs/ppde.inp +++ b/test/pargen/runs/ppde.inp @@ -2,13 +2,13 @@ BICGSTAB Iterative method BICGSTAB CGS BICG BICGSTABL RGMRES FCG CGR BJAC Preconditioner NONE DIAG BJAC CSR Storage format for matrix A: CSR COO -040 Domain size (acutal system is this**3 (pde3d) or **2 (pde2d) ) +140 Domain size (acutal system is this**3 (pde3d) or **2 (pde2d) ) 3 Partition: 1 BLOCK 3 3D 2 Stopping criterion 1 2 -0100 MAXIT +0200 MAXIT 05 ITRACE 002 IRST restart for RGMRES and BiCGSTABL -ILU Block Solver ILU,ILUT,INVK,AINVT,AORTH +INVK Block Solver ILU,ILUT,INVK,AINVT,AORTH NONE If ILU : MILU or NONE othewise ignored NONE Scaling if ILUT: NONE, MAXVAL otherwise ignored 0 Level of fill for forward factorization