diff --git a/README.md b/README.md index d2f19f95..86bb2805 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,8 @@ Salvatore Filippone Contributors (roughly reverse cronological order): +Andea Di Iorio +Stefano Petrilli Soren Rasmussen Zaak Beekman Ambra Abdullahi Hassan diff --git a/base/modules/auxil/psb_c_hsort_x_mod.f90 b/base/modules/auxil/psb_c_hsort_x_mod.f90 index c0e39411..8f0437f7 100644 --- a/base/modules/auxil/psb_c_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_c_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_c_realloc_mod.F90 b/base/modules/auxil/psb_c_realloc_mod.F90 index 9e6af5a8..9b22bee7 100644 --- a/base/modules/auxil/psb_c_realloc_mod.F90 +++ b/base/modules/auxil/psb_c_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_d_hsort_x_mod.f90 b/base/modules/auxil/psb_d_hsort_x_mod.f90 index 7273e972..ba45d683 100644 --- a/base/modules/auxil/psb_d_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_d_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_d_realloc_mod.F90 b/base/modules/auxil/psb_d_realloc_mod.F90 index 672b4677..ca85e0ec 100644 --- a/base/modules/auxil/psb_d_realloc_mod.F90 +++ b/base/modules/auxil/psb_d_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_e_realloc_mod.F90 b/base/modules/auxil/psb_e_realloc_mod.F90 index 0f2431fd..06a6d034 100644 --- a/base/modules/auxil/psb_e_realloc_mod.F90 +++ b/base/modules/auxil/psb_e_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_i2_realloc_mod.F90 b/base/modules/auxil/psb_i2_realloc_mod.F90 index 22e85d36..146bdf7e 100644 --- a/base/modules/auxil/psb_i2_realloc_mod.F90 +++ b/base/modules/auxil/psb_i2_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_i_hsort_x_mod.f90 b/base/modules/auxil/psb_i_hsort_x_mod.f90 index 0d1288a6..4bbc3d7f 100644 --- a/base/modules/auxil/psb_i_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_i_hsort_x_mod.f90 @@ -124,7 +124,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -237,9 +237,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_l_hsort_x_mod.f90 b/base/modules/auxil/psb_l_hsort_x_mod.f90 index 487e8ce9..5134d6bb 100644 --- a/base/modules/auxil/psb_l_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_l_hsort_x_mod.f90 @@ -124,7 +124,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_lpk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -237,9 +237,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_lpk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_lpk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_m_realloc_mod.F90 b/base/modules/auxil/psb_m_realloc_mod.F90 index c81ed83a..4d2f9316 100644 --- a/base/modules/auxil/psb_m_realloc_mod.F90 +++ b/base/modules/auxil/psb_m_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_s_hsort_x_mod.f90 b/base/modules/auxil/psb_s_hsort_x_mod.f90 index 34f69ea4..204dbbf4 100644 --- a/base/modules/auxil/psb_s_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_s_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_s_realloc_mod.F90 b/base/modules/auxil/psb_s_realloc_mod.F90 index 0b2873e3..f064e606 100644 --- a/base/modules/auxil/psb_s_realloc_mod.F90 +++ b/base/modules/auxil/psb_s_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/auxil/psb_z_hsort_x_mod.f90 b/base/modules/auxil/psb_z_hsort_x_mod.f90 index 39f52e4f..4b7302aa 100644 --- a/base/modules/auxil/psb_z_hsort_x_mod.f90 +++ b/base/modules/auxil/psb_z_hsort_x_mod.f90 @@ -123,7 +123,7 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 @@ -236,9 +236,9 @@ contains return endif - call psb_ensure_size(heap%last+1,heap%keys,info,addsz=(1_psb_ipk_)*psb_heap_resize) + call psb_ensure_size(heap%last+1,heap%keys,info) if (info == psb_success_) & - & call psb_ensure_size(heap%last+1,heap%idxs,info,addsz=(1_psb_ipk_)*psb_heap_resize) + & call psb_ensure_size(heap%last+1,heap%idxs,info) if (info /= psb_success_) then write(psb_err_unit,*) 'Memory allocation failure in heap_insert' info = -5 diff --git a/base/modules/auxil/psb_z_realloc_mod.F90 b/base/modules/auxil/psb_z_realloc_mod.F90 index e6eeac2f..e9eb26d3 100644 --- a/base/modules/auxil/psb_z_realloc_mod.F90 +++ b/base/modules/auxil/psb_z_realloc_mod.F90 @@ -768,7 +768,7 @@ Contains integer(psb_ipk_) :: info ! ...Local Variables - integer(psb_ipk_) :: isz,err_act,lb + integer(psb_ipk_) :: isz,err_act,lb, i character(len=30) :: name, char_err logical, parameter :: debug=.false. @@ -790,7 +790,11 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else - vout(:) = vin(:) + !$omp parallel do private(i) + do i=lb,lb+isz-1 + vout(i) = vin(i) + end do + !$omp end parallel do endif endif @@ -836,7 +840,9 @@ Contains call psb_errpush(info,name,a_err=char_err) goto 9999 else + !$omp workshare vout(:,:) = vin(:,:) + !$omp end workshare endif endif @@ -991,36 +997,17 @@ Contains goto 9999 end if -!!$ If (len > psb_size(v)) Then -!!$ if (present(newsz)) then -!!$ isz = (max(len+1,newsz)) -!!$ else -!!$ if (present(addsz)) then -!!$ isz = len+max(1,addsz) -!!$ else -!!$ isz = max(len+10, int(1.25*len)) -!!$ endif -!!$ endif -!!$ -!!$ call psb_realloc(isz,v,info,pad=pad) -!!$ if (info /= psb_success_) then -!!$ info=psb_err_from_subroutine_ -!!$ call psb_errpush(info,name,a_err='psb_realloc') -!!$ goto 9999 -!!$ End If -!!$ end If - If (len > psb_size(v)) Then + isz = psb_size(v) + If (len > isz) Then #if defined(OPENMP) !$OMP CRITICAL - if (len > psb_size(v)) then + if (len > isz) then if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) @@ -1033,17 +1020,18 @@ Contains goto 9999 end if #else - if (present(newsz)) then - isz = (max(len+1,newsz)) - else - if (present(addsz)) then - isz = len+max(1,addsz) + if (len > isz) then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - isz = max(len+10, int(1.25*len)) + isz = max(len,1,int(1.25*isz)) endif - endif - call psb_realloc(isz,v,info,pad=pad) + call psb_realloc(isz,v,info,pad=pad) + end if + if (info /= psb_success_) then info=psb_err_from_subroutine_ call psb_errpush(info,name,a_err='psb_realloc') @@ -1085,16 +1073,14 @@ Contains info=psb_err_from_subroutine_ goto 9999 end if - - If (len > psb_size(v)) Then - if (present(newsz)) then - isz = (max(len+1,newsz)) + isz = psb_size(v) + If (len > isz) Then + if (present(newsz)) then + isz = max(len+1,1,newsz) + else if (present(addsz)) then + isz = max(len,1,isz+addsz) else - if (present(addsz)) then - isz = len+max(1,addsz) - else - isz = max(len+10, int(1.25*len)) - endif + isz = max(len,1,int(1.25*isz)) endif call psb_realloc(isz,v,info,pad=pad) diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index b7f53879..058dbb8d 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -652,7 +652,7 @@ contains integer(psb_ipk_) :: me, np character(len=20) :: name,ch_err logical, allocatable :: mask_(:) - logical :: use_openmp = .true. +!!$ logical :: use_openmp = .true. #ifdef OPENMP integer(kind = OMP_lock_kind) :: ins_lck #endif @@ -683,119 +683,32 @@ contains mglob = idxmap%get_gr() nrow = idxmap%get_lr() !write(0,*) me,name,' before loop ',psb_errstatus_fatal() - if (use_openmp) then #ifdef OPENMP - !call OMP_init_lock(ins_lck) + !call OMP_init_lock(ins_lck) - if (idxmap%is_bld()) then + if (idxmap%is_bld()) then - isLoopValid = .true. - ncol = idxmap%get_lc() - if (present(mask)) then - mask_ = mask - else - allocate(mask_(size(idx))) - mask_ = .true. - end if - - if (present(lidx)) then - if (present(mask)) then - !$omp critical(hash_g2l_ins) - - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - !call OMP_set_lock(ins_lck) - ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - !call OMP_set_lock(ins_lck) - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - ncol = idxmap%get_lc() - nxt = lidx(i) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - - if (info /= psb_success_) then - !write(0,*) 'Error spot 1' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - end if - end if - else - idx(i) = -1 - end if - !call OMP_unset_lock(ins_lck) - end if - else - idx(i) = lip - end if - else - idx(i) = -1 - end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) + isLoopValid = .true. + ncol = idxmap%get_lc() + if (present(mask)) then + mask_ = mask + else + allocate(mask_(size(idx))) + mask_ = .true. + end if - if (.not. isLoopValid) then - goto 9999 - end if - else - !$omp critical(hash_g2l_ins) + if (present(lidx)) then + if (present(mask)) then + !$omp critical(hash_g2l_ins) - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 @@ -812,6 +725,7 @@ contains & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index ! is often already existing). @@ -823,9 +737,11 @@ contains if (lip > 0) then idx(i) = lip else if (lip < 0) then + ! Index not found call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) lip = tlip + if (info >= 0) then ! 'nxt' is not equal to 'tlip' when the key is already inside ! the hash map. In that case 'tlip' is the value corresponding @@ -838,7 +754,7 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then - !write(0,*) 'Error spot 2' + !write(0,*) 'Error spot 1' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -858,160 +774,160 @@ contains else idx(i) = lip end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if + + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 end if - else if (.not.present(lidx)) then - if(present(mask)) then - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - - !$omp critical(hash_g2l_ins) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle - endif - !call OMP_set_lock(ins_lck) - ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) + else + !$omp critical(hash_g2l_ins) - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() - if (lip < 0) then - !call OMP_set_lock(ins_lck) - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing, so this lock is relatively rare). - ncol = idxmap%get_lc() - nxt = ncol + 1 - !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() - if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() - !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() - lip = tlip - - if (info >= 0) then - !write(0,*) 'Error before spot 3', info - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() - if (info /= psb_success_) then - !write(0,*) 'Error spot 3', info - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - end if - end if - else + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). + ncol = idxmap%get_lc() + nxt = lidx(i) + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + + ncol = MAX(ncol,nxt) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + !write(0,*) 'Error spot 2' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) end if - !call OMP_unset_lock(ins_lck) end if else - idx(i) = lip + idx(i) = -1 end if - else - idx(i) = -1 + !call OMP_unset_lock(ins_lck) end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = lip end if - else - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - !$omp critical(hash_g2l_ins) - do i = 1, is - info = 0 - if (.not. isLoopValid) cycle + + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + end if + else if (.not.present(lidx)) then + if(present(mask)) then + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + + !$omp critical(hash_g2l_ins) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + if (mask(i)) then ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - !call OMP_set_lock(ins_lck) + !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() !call OMP_unset_lock(ins_lck) ! At first, we check the index presence in 'idxmap'. Usually ! the index is found. If it is not found, we repeat the checking, ! but inside a critical region. + !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) + !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() if (lip < 0) then !call OMP_set_lock(ins_lck) ! We check again if the index is already in 'idxmap', this ! time inside a critical region (we assume that the index - ! is often already existing). + ! is often already existing, so this lock is relatively rare). ncol = idxmap%get_lc() - nxt = ncol + 1 + nxt = ncol + 1 + !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - + !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() if (lip > 0) then idx(i) = lip else if (lip < 0) then ! Index not found + !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() + !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() lip = tlip if (info >= 0) then + !write(0,*) 'Error before spot 3', info ! 'nxt' is not equal to 'tlip' when the key is already inside ! the hash map. In that case 'tlip' is the value corresponding ! to the existing mapping. if (nxt == tlip) then ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - + if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() if (info /= psb_success_) then - !write(0,*) 'Error spot 4' + !write(0,*) 'Error spot 3', info call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) @@ -1028,116 +944,145 @@ contains end if !call OMP_unset_lock(ins_lck) end if - else idx(i) = lip end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 + else + idx(i) = -1 end if - end if - end if - else - ! Wrong state - idx = -1 - info = -1 - end if - !call OMP_destroy_lock(ins_lck) -#endif - else if (.not.use_openmp) then -#ifdef OPENMP - ! $ omp parallel - ! $ omp critical - !write(0,*) 'In cnv: ',omp_get_num_threads() -#endif - isLoopValid = .true. - if (idxmap%is_bld()) then + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) - if (present(lidx)) then - if (present(mask)) then - do i = 1, is + if (.not. isLoopValid) then + goto 9999 + end if + else + ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & + ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + ! $ OMP private(i,ip,lip,tlip,nxt,info) & + ! $ OMP reduction(.AND.:isLoopValid) + !$omp critical(hash_g2l_ins) + do i = 1, is + info = 0 + if (.not. isLoopValid) cycle + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + !call OMP_set_lock(ins_lck) + ncol = idxmap%get_lc() + !call OMP_unset_lock(ins_lck) + + ! At first, we check the index presence in 'idxmap'. Usually + ! the index is found. If it is not found, we repeat the checking, + ! but inside a critical region. + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + !call OMP_set_lock(ins_lck) + ! We check again if the index is already in 'idxmap', this + ! time inside a critical region (we assume that the index + ! is often already existing). ncol = idxmap%get_lc() - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob) ) then - idx(i) = -1 - cycle - endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - tlip = lip - nxt = lidx(i) - if (nxt <= nrow) then - idx(i) = -1 - cycle - endif - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (info >=0) then - if (nxt == tlip) then - ncol = max(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - !write(0,*) 'Error spot' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. - end if - idxmap%loc_to_glob(nxt) = ip + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + + if (lip > 0) then + idx(i) = lip + else if (lip < 0) then + ! Index not found + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + + if (info >= 0) then + ! 'nxt' is not equal to 'tlip' when the key is already inside + ! the hash map. In that case 'tlip' is the value corresponding + ! to the existing mapping. + if (nxt == tlip) then + + ncol = MAX(ncol,nxt) + + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + + if (info /= psb_success_) then + !write(0,*) 'Error spot 4' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + + isLoopValid = .false. + idx(i) = -1 + else + idx(i) = lip + idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - isLoopValid = .false. + end if end if + else + idx(i) = -1 end if - idx(i) = lip - info = psb_success_ - else - idx(i) = -1 + !call OMP_unset_lock(ins_lck) end if - enddo - else if (.not.present(mask)) then + else + idx(i) = lip + end if - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + end do + ! $ OMP END PARALLEL DO + !$omp end critical(hash_g2l_ins) + + if (.not. isLoopValid) then + goto 9999 + end if + + end if + end if + else + ! Wrong state + idx = -1 + info = -1 + end if + !call OMP_destroy_lock(ins_lck) +#else +!!$ else if (.not.use_openmp) then + isLoopValid = .true. + if (idxmap%is_bld()) then + + if (present(lidx)) then + if (present(mask)) then + do i = 1, is + ncol = idxmap%get_lc() + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then idx(i) = -1 cycle endif - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then + call hash_inner_cnv(ip,lip,idxmap%hashvmask,& + & idxmap%hashv,idxmap%glb_lc,ncol) + if (lip < 0) then + tlip = lip nxt = lidx(i) if (nxt <= nrow) then idx(i) = -1 cycle endif call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - if (info >=0) then - if (nxt == lip) then - ncol = max(nxt,ncol) + if (nxt == tlip) then + ncol = max(ncol,nxt) call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then - info=1 - !write(0,*) 'Error spot' + !write(0,*) 'Error spot' call psb_errpush(psb_err_from_subroutine_ai_,name,& &a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. + isLoopValid = .false. end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) @@ -1151,66 +1096,71 @@ contains end if idx(i) = lip info = psb_success_ - enddo + else + idx(i) = -1 + end if + enddo - end if + else if (.not.present(mask)) then - else if (.not.present(lidx)) then + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + cycle + endif + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip - if (present(mask)) then - do i = 1, is - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then - idx(i) = -1 - cycle + if (info >=0) then + if (nxt == lip) then + ncol = max(nxt,ncol) + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + !write(0,*) 'Error spot' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) endif - ncol = idxmap%get_lc() - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& - & idxmap%glb_lc,ncol) - if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - end if - - if (info >=0) then - if (nxt == lip) then - ncol = nxt - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - info=1 - write(0,*) 'Error spot 5' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='psb_ensure_size',i_err=(/info/)) - isLoopValid = .false. - end if - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - endif - info = psb_success_ - else - call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='SearchInsKeyVal',i_err=(/info/)) - isLoopValid = .false. - end if - idx(i) = lip info = psb_success_ else - idx(i) = -1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if - enddo - else if (.not.present(mask)) then + end if + idx(i) = lip + info = psb_success_ + enddo - do i = 1, is - ncol = idxmap%get_lc() - ip = idx(i) + end if + + else if (.not.present(lidx)) then + + if (present(mask)) then + do i = 1, is + if (mask(i)) then + ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - nxt = ncol + 1 + ncol = idxmap%get_lc() + nxt = ncol + 1 call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& & idxmap%glb_lc,ncol) if (lip < 0) then @@ -1225,41 +1175,80 @@ contains & pad=-1_psb_lpk_,addsz=laddsz) if (info /= psb_success_) then info=1 - write(0,*) 'Error spot 6' - ch_err='psb_ensure_size' + write(0,*) 'Error spot 5' call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + & a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - end if idxmap%loc_to_glob(nxt) = ip call idxmap%set_lc(ncol) endif info = psb_success_ else - ch_err='SearchInsKeyVal' call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) - isLoopValid = .false. + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if idx(i) = lip info = psb_success_ - enddo + else + idx(i) = -1 + end if + enddo + else if (.not.present(mask)) then + + do i = 1, is + ncol = idxmap%get_lc() + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then + idx(i) = -1 + cycle + endif + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip < 0) then + call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip + end if + + if (info >=0) then + if (nxt == lip) then + ncol = nxt + call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + write(0,*) 'Error spot 6' + ch_err='psb_ensure_size' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + &a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + isLoopValid = .false. + + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(ncol) + endif + info = psb_success_ + else + ch_err='SearchInsKeyVal' + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err=ch_err,i_err=(/info,izero,izero,izero,izero/)) + isLoopValid = .false. + end if + idx(i) = lip + info = psb_success_ + enddo - end if end if - else - ! Wrong state - idx = -1 - info = -1 end if -#ifdef OPENMP - ! $ omp end critical - ! $ omp end parallel - -#endif - if (.not. isLoopValid) goto 9999 + else + ! Wrong state + idx = -1 + info = -1 end if + if (.not. isLoopValid) goto 9999 +#endif !write(0,*) me,name,' after loop ',psb_errstatus_fatal() call psb_erractionrestore(err_act) return diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90 index a5dd0fd0..33982e3a 100644 --- a/base/modules/serial/psb_c_base_mat_mod.F90 +++ b/base/modules/serial/psb_c_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_c_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_c_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_c_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_c_coo_ensure_size + procedure, pass(a) :: tril => psb_c_coo_tril + procedure, pass(a) :: triu => psb_c_coo_triu procedure, pass(a) :: cp_to_coo => psb_c_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_c_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_c_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_c_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_c_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_c_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_c_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_c_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_c_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_c_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_c_coo_triu + end interface + !> !! \memberof psb_c_coo_sparse_mat diff --git a/base/modules/serial/psb_c_base_vect_mod.F90 b/base/modules/serial/psb_c_base_vect_mod.F90 index 44044771..df15e0c9 100644 --- a/base/modules/serial/psb_c_base_vect_mod.F90 +++ b/base/modules/serial/psb_c_base_vect_mod.F90 @@ -481,7 +481,11 @@ contains implicit none class(psb_c_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=czero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=czero + !$omp end workshare + end if call x%set_host() end subroutine c_base_zero diff --git a/base/modules/serial/psb_d_base_mat_mod.F90 b/base/modules/serial/psb_d_base_mat_mod.F90 index fac0af20..5f4c76df 100644 --- a/base/modules/serial/psb_d_base_mat_mod.F90 +++ b/base/modules/serial/psb_d_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_d_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_d_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_d_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_d_coo_ensure_size + procedure, pass(a) :: tril => psb_d_coo_tril + procedure, pass(a) :: triu => psb_d_coo_triu procedure, pass(a) :: cp_to_coo => psb_d_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_d_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_d_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_d_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_d_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_d_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_d_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_d_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_d_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_d_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_d_coo_triu + end interface + !> !! \memberof psb_d_coo_sparse_mat diff --git a/base/modules/serial/psb_d_base_vect_mod.F90 b/base/modules/serial/psb_d_base_vect_mod.F90 index a28d12f6..87f5b0e4 100644 --- a/base/modules/serial/psb_d_base_vect_mod.F90 +++ b/base/modules/serial/psb_d_base_vect_mod.F90 @@ -488,7 +488,11 @@ contains implicit none class(psb_d_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=dzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=dzero + !$omp end workshare + end if call x%set_host() end subroutine d_base_zero diff --git a/base/modules/serial/psb_i_base_vect_mod.F90 b/base/modules/serial/psb_i_base_vect_mod.F90 index 0289ecd0..a5cddeb5 100644 --- a/base/modules/serial/psb_i_base_vect_mod.F90 +++ b/base/modules/serial/psb_i_base_vect_mod.F90 @@ -417,7 +417,11 @@ contains implicit none class(psb_i_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=izero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=izero + !$omp end workshare + end if call x%set_host() end subroutine i_base_zero diff --git a/base/modules/serial/psb_l_base_vect_mod.F90 b/base/modules/serial/psb_l_base_vect_mod.F90 index d8654f63..93b29e17 100644 --- a/base/modules/serial/psb_l_base_vect_mod.F90 +++ b/base/modules/serial/psb_l_base_vect_mod.F90 @@ -418,7 +418,11 @@ contains implicit none class(psb_l_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=lzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=lzero + !$omp end workshare + end if call x%set_host() end subroutine l_base_zero diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90 index 186ae577..92bda7d8 100644 --- a/base/modules/serial/psb_s_base_mat_mod.F90 +++ b/base/modules/serial/psb_s_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_s_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_s_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_s_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_s_coo_ensure_size + procedure, pass(a) :: tril => psb_s_coo_tril + procedure, pass(a) :: triu => psb_s_coo_triu procedure, pass(a) :: cp_to_coo => psb_s_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_s_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_s_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_s_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_s_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_s_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_s_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_s_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_s_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_s_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_s_coo_triu + end interface + !> !! \memberof psb_s_coo_sparse_mat diff --git a/base/modules/serial/psb_s_base_vect_mod.F90 b/base/modules/serial/psb_s_base_vect_mod.F90 index 4bd6bbfb..fccd846b 100644 --- a/base/modules/serial/psb_s_base_vect_mod.F90 +++ b/base/modules/serial/psb_s_base_vect_mod.F90 @@ -488,7 +488,11 @@ contains implicit none class(psb_s_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=szero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=szero + !$omp end workshare + end if call x%set_host() end subroutine s_base_zero diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90 index 3ce9074f..3e8196f4 100644 --- a/base/modules/serial/psb_z_base_mat_mod.F90 +++ b/base/modules/serial/psb_z_base_mat_mod.F90 @@ -168,6 +168,8 @@ module psb_z_base_mat_mod procedure, pass(a) :: reallocate_nz => psb_z_coo_reallocate_nz procedure, pass(a) :: allocate_mnnz => psb_z_coo_allocate_mnnz procedure, pass(a) :: ensure_size => psb_z_coo_ensure_size + procedure, pass(a) :: tril => psb_z_coo_tril + procedure, pass(a) :: triu => psb_z_coo_triu procedure, pass(a) :: cp_to_coo => psb_z_cp_coo_to_coo procedure, pass(a) :: cp_from_coo => psb_z_cp_coo_from_coo procedure, pass(a) :: cp_to_fmt => psb_z_cp_coo_to_fmt @@ -1894,6 +1896,93 @@ module psb_z_base_mat_mod integer(psb_ipk_), intent(in), optional :: idir end subroutine psb_z_fix_coo end interface + ! + !> Function tril: + !! \memberof psb_z_coo_sparse_mat + !! \brief Copy the lower triangle, i.e. all entries + !! A(I,J) such that J-I <= DIAG + !! default value is DIAG=0, i.e. lower triangle up to + !! the main diagonal. + !! DIAG=-1 means copy the strictly lower triangle + !! DIAG= 1 means copy the lower triangle plus the first diagonal + !! of the upper triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! + !! \param l the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param u [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_z_coo_tril(a,l,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,u) + import + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: u + end subroutine psb_z_coo_tril + end interface + + ! + !> Function triu: + !! \memberof psb_z_coo_sparse_mat + !! \brief Copy the upper triangle, i.e. all entries + !! A(I,J) such that DIAG <= J-I + !! default value is DIAG=0, i.e. upper triangle from + !! the main diagonal up. + !! DIAG= 1 means copy the strictly upper triangle + !! DIAG=-1 means copy the upper triangle plus the first diagonal + !! of the lower triangle. + !! Moreover, apply a clipping by copying entries A(I,J) only if + !! IMIN<=I<=IMAX + !! JMIN<=J<=JMAX + !! Optionally copies the lower triangle at the same time + !! + !! \param u the output (sub)matrix + !! \param info return code + !! \param diag [0] the last diagonal (J-I) to be considered. + !! \param imin [1] the minimum row index we are interested in + !! \param imax [a\%get_nrows()] the minimum row index we are interested in + !! \param jmin [1] minimum col index + !! \param jmax [a\%get_ncols()] maximum col index + !! \param iren(:) [none] an array to return renumbered indices (iren(ia(:)),iren(ja(:)) + !! \param rscale [false] map [min(ia(:)):max(ia(:))] onto [1:max(ia(:))-min(ia(:))+1] + !! \param cscale [false] map [min(ja(:)):max(ja(:))] onto [1:max(ja(:))-min(ja(:))+1] + !! ( iren cannot be specified with rscale/cscale) + !! \param append [false] append to ia,ja + !! \param nzin [none] if append, then first new entry should go in entry nzin+1 + !! \param l [none] copy of the complementary triangle + !! + ! + interface + subroutine psb_z_coo_triu(a,u,info,diag,imin,imax,& + & jmin,jmax,rscale,cscale,l) + import + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: l + end subroutine psb_z_coo_triu + end interface + !> !! \memberof psb_z_coo_sparse_mat diff --git a/base/modules/serial/psb_z_base_vect_mod.F90 b/base/modules/serial/psb_z_base_vect_mod.F90 index c52dcd59..2a14de21 100644 --- a/base/modules/serial/psb_z_base_vect_mod.F90 +++ b/base/modules/serial/psb_z_base_vect_mod.F90 @@ -481,7 +481,11 @@ contains implicit none class(psb_z_base_vect_type), intent(inout) :: x - if (allocated(x%v)) x%v=zzero + if (allocated(x%v)) then + !$omp workshare + x%v(:)=zzero + !$omp end workshare + end if call x%set_host() end subroutine z_base_zero diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 03939afe..46391dee 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2879,9 +2881,6 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) nzaold = nza nza = nza + nz call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 @@ -2960,8 +2959,7 @@ contains end if end do !$OMP END PARALLEL DO - - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) @@ -3489,6 +3487,602 @@ subroutine psb_c_coo_mv_from(a,b) end subroutine psb_c_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_c_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_tril + implicit none + + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_coo_tril + +subroutine psb_c_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_coo_triu + implicit none + + class(psb_c_coo_sparse_mat), intent(in) :: a + class(psb_c_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_c_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_coo_triu + subroutine psb_c_fix_coo(a,info,idir) use psb_const_mod @@ -4563,7 +5157,7 @@ function psb_lc_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +5224,7 @@ function psb_lc_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +5274,7 @@ function psb_lc_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_c_csc_impl.F90 b/base/serial/impl/psb_c_csc_impl.F90 index d8837b76..54332d06 100644 --- a/base/serial/impl/psb_c_csc_impl.F90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_c_cp_csc_to_fmt(a,b,info) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_c_cp_csc_from_fmt(a,b,info) a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 47779646..6c3ecec6 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_c_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_c_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_c_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_c_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -3190,9 +3490,28 @@ subroutine psb_c_cp_csr_to_fmt(a,b,info) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3595,27 @@ subroutine psb_c_cp_csr_from_fmt(a,b,info) a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 9597c5f5..c2babf8e 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2879,9 +2881,6 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) nzaold = nza nza = nza + nz call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 @@ -2960,8 +2959,7 @@ contains end if end do !$OMP END PARALLEL DO - - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) @@ -3489,6 +3487,602 @@ subroutine psb_d_coo_mv_from(a,b) end subroutine psb_d_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_d_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_tril + implicit none + + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_coo_tril + +subroutine psb_d_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_d_base_mat_mod, psb_protect_name => psb_d_coo_triu + implicit none + + class(psb_d_coo_sparse_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_d_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_coo_triu + subroutine psb_d_fix_coo(a,info,idir) use psb_const_mod @@ -4563,7 +5157,7 @@ function psb_ld_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +5224,7 @@ function psb_ld_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +5274,7 @@ function psb_ld_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_d_csc_impl.F90 b/base/serial/impl/psb_d_csc_impl.F90 index a50d5026..1761b051 100644 --- a/base/serial/impl/psb_d_csc_impl.F90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_d_cp_csc_to_fmt(a,b,info) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_d_cp_csc_from_fmt(a,b,info) a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 1e39b892..880bebf8 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_d_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_d_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_d_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_d_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -3190,9 +3490,28 @@ subroutine psb_d_cp_csr_to_fmt(a,b,info) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3595,27 @@ subroutine psb_d_cp_csr_from_fmt(a,b,info) a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 174d3e07..402c608a 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2879,9 +2881,6 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) nzaold = nza nza = nza + nz call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 @@ -2960,8 +2959,7 @@ contains end if end do !$OMP END PARALLEL DO - - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) @@ -3489,6 +3487,602 @@ subroutine psb_s_coo_mv_from(a,b) end subroutine psb_s_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_s_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_tril + implicit none + + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_coo_tril + +subroutine psb_s_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_coo_triu + implicit none + + class(psb_s_coo_sparse_mat), intent(in) :: a + class(psb_s_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_s_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_coo_triu + subroutine psb_s_fix_coo(a,info,idir) use psb_const_mod @@ -4563,7 +5157,7 @@ function psb_ls_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +5224,7 @@ function psb_ls_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +5274,7 @@ function psb_ls_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_s_csc_impl.F90 b/base/serial/impl/psb_s_csc_impl.F90 index dc4ca30b..a66b7dc0 100644 --- a/base/serial/impl/psb_s_csc_impl.F90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_s_cp_csc_to_fmt(a,b,info) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_s_cp_csc_from_fmt(a,b,info) a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 22ceb6e5..dbe7a4be 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_s_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_s_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_s_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_s_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -3190,9 +3490,28 @@ subroutine psb_s_cp_csr_to_fmt(a,b,info) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3595,27 @@ subroutine psb_s_cp_csr_from_fmt(a,b,info) a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 2ccc614a..542f842e 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -2864,7 +2864,9 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (nz == 0) return if (a%is_bld()) then - + ! Structure here is peculiar, because this function can be called + ! either within a parallel region, or outside. + ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() isza = a%get_size() @@ -2879,9 +2881,6 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) nzaold = nza nza = nza + nz call a%set_nzeros(nza) -#if defined(OPENMP) - !write(0,*) 'From thread ',omp_get_thread_num(),nzaold,nz,nza,a%get_nzeros() -#endif end if !$omp end critical if (info /= 0) goto 9999 @@ -2960,8 +2959,7 @@ contains end if end do !$OMP END PARALLEL DO - - !nza = nza + nz + nza = nza + nz #else do i=1, nz ir = ia(i) @@ -3489,6 +3487,602 @@ subroutine psb_z_coo_mv_from(a,b) end subroutine psb_z_coo_mv_from +! +! COO implementation of tril/triu +! +subroutine psb_z_coo_tril(a,l,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,u) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_tril + implicit none + + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: l + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: u + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='tril' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = a%ia(k) + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block + +#else + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzlin = nzlin + 1 + l%ia(nzlin) = i + l%ja(nzlin) = ja(k) + l%val(nzlin) = val(k) + else + nzuin = nzuin + 1 + u%ia(nzuin) = i + u%ja(nzuin) = ja(k) + u%val(nzuin) = val(k) + end if + end if + end do loop1 + end associate + + call l%set_nzeros(nzlin) + call u%set_nzeros(nzuin) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + nzin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + nzin = nzin + 1 + l%ia(nzin) = i + l%ja(nzin) = ja(k) + l%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call l%set_nzeros(nzin) + end if + call l%fix(info) + nzout = l%get_nzeros() + if (rscale_) & + & l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_coo_tril + +subroutine psb_z_coo_triu(a,u,info,& + & diag,imin,imax,jmin,jmax,rscale,cscale,l) + ! Output is always in COO format + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_coo_triu + implicit none + + class(psb_z_coo_sparse_mat), intent(in) :: a + class(psb_z_coo_sparse_mat), intent(out) :: u + integer(psb_ipk_),intent(out) :: info + integer(psb_ipk_), intent(in), optional :: diag,imin,imax,jmin,jmax + logical, intent(in), optional :: rscale,cscale + class(psb_z_coo_sparse_mat), optional, intent(out) :: l + + integer(psb_ipk_) :: err_act, nzin, nzout, i, j, k + integer(psb_ipk_) :: imin_, imax_, jmin_, jmax_, mb,nb, diag_, nzlin, nzuin, nz + character(len=20) :: name='triu' + logical :: rscale_, cscale_ + logical, parameter :: debug=.false. + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(diag)) then + diag_ = diag + else + diag_ = 0 + end if + if (present(imin)) then + imin_ = imin + else + imin_ = 1 + end if + if (present(imax)) then + imax_ = imax + else + imax_ = a%get_nrows() + end if + if (present(jmin)) then + jmin_ = jmin + else + jmin_ = 1 + end if + if (present(jmax)) then + jmax_ = jmax + else + jmax_ = a%get_ncols() + end if + if (present(rscale)) then + rscale_ = rscale + else + rscale_ = .true. + end if + if (present(cscale)) then + cscale_ = cscale + else + cscale_ = .true. + end if + + if (rscale_) then + mb = imax_ - imin_ +1 + else + mb = imax_ + endif + if (cscale_) then + nb = jmax_ - jmin_ +1 + else + nb = jmax_ + endif + +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do k=1,nz + i = a%ia(k) + j = a%ja(k) + if ((i>=imin_).and.(i<=imax_).and.(jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = a%ia(k) + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + associate(val =>a%val, ja => a%ja, ia=>a%ia) + loop1: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)a%val, ja => a%ja, ia=>a%ia) + loop2: do k=1,nz + i = ia(k) + j = ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + nzin = nzin + 1 + u%ia(nzin) = i + u%ja(nzin) = ja(k) + u%val(nzin) = val(k) + end if + end if + end do loop2 + end associate + call u%set_nzeros(nzin) + end if + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) & + & u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + if (cscale_) & + & u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if +#endif + if (info /= psb_success_) goto 9999 + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_coo_triu + subroutine psb_z_fix_coo(a,info,idir) use psb_const_mod @@ -4563,7 +5157,7 @@ function psb_lz_coo_maxval(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, nnz res = max(res,abs(a%val(i))) end do @@ -4630,7 +5224,7 @@ function psb_lz_coo_csnmi(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, m res = max(res,abs(vt(i))) end do @@ -4680,7 +5274,7 @@ function psb_lz_coo_csnm1(a) result(res) #if defined(OPENMP) block integer(psb_ipk_) :: i - !$omp parallel do private(i) + !$omp parallel do private(i) reduction(max:res) do i=1, n res = max(res,abs(vt(i))) end do diff --git a/base/serial/impl/psb_z_csc_impl.F90 b/base/serial/impl/psb_z_csc_impl.F90 index 035d0e83..e5516bd9 100644 --- a/base/serial/impl/psb_z_csc_impl.F90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2329,9 +2329,28 @@ subroutine psb_z_cp_csc_to_fmt(a,b,info) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nc = a%get_ncols() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) - if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) + if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,b%icp,info) + call psb_realloc(nz,b%ia,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + b%icp(i)=a%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ia(j) = a%ia(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -2443,9 +2462,27 @@ subroutine psb_z_cp_csc_from_fmt(a,b,info) a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nc = b%get_ncols() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) - if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) + if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz), a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nc+1,a%icp,info) + call psb_realloc(nz,a%ia,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nc+1 + a%icp(i)=b%icp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ia(j)=b%ia(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 60839e5d..9322105e 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2289,7 +2289,155 @@ subroutine psb_z_csr_tril(a,l,info,& nb = jmax_ endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,lrws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call l%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(u)) then + nzlin = l%get_nzeros() ! At this point it should be 0 + call u%allocate(mb,nb,nz) + nzuin = u%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + else + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop1 + !$omp end parallel do + + call psi_exscan(mb,lrws,info) + call psi_exscan(mb,urws,info) + !write(0,*) lrws(:), urws(:) + !$omp parallel do private(i,j,k,lpnt,upnt) shared(imin_,imax_,a) + loop2: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + else + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop2 + !$omp end parallel do + !write(0,*) 'End of copyout',lnz,unz + call l%set_nzeros(lnz) + call l%fix(info) + call u%set_nzeros(unz) + call u%fix(info) + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ >=-1).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_lower(.false.) + end if + else + lnz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws) reduction(+: lnz) + loop3: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic update + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + !$omp end atomic + lnz = lnz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,lrws,info) + !$omp parallel do private(i,j,k,lpnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)<=diag_) then + !$omp atomic capture + lrws(i-imin_+1) = lrws(i-imin_+1) +1 + lpnt = lrws(i-imin_+1) + !$omp end atomic + l%ia(lpnt) = i + l%ja(lpnt) = a%ja(k) + l%val(lpnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call l%set_nzeros(lnz) + call l%fix(info) + end if + nzout = l%get_nzeros() + if (rscale_) then + !$omp workshare + l%ia(1:nzout) = l%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + l%ja(1:nzout) = l%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + if ((diag_ <= 0).and.(imin_ == jmin_)) then + call l%set_triangle(.true.) + call l%set_lower(.true.) + end if + end block +#else nz = a%get_nzeros() call l%allocate(mb,nb,nz) @@ -2359,7 +2507,7 @@ subroutine psb_z_csr_tril(a,l,info,& call l%set_triangle(.true.) call l%set_lower(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -2443,6 +2591,158 @@ subroutine psb_z_csr_triu(a,u,info,& endif +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: lrws(:),urws(:) + integer(psb_ipk_) :: lpnt, upnt, lnz, unz + call psb_realloc(mb,urws,info) + !$omp workshare + urws(:) = 0 + !$omp end workshare + nz = a%get_nzeros() + call u%allocate(mb,nb,nz) + !write(0,*) 'Invocation of COO%TRIL', present(u),nz + if (present(l)) then + nzuin = u%get_nzeros() ! At this point it should be 0 + call l%allocate(mb,nb,nz) + nzlin = l%get_nzeros() ! At this point it should be 0 + if (info == 0) call psb_realloc(mb,urws,info) + !$omp workshare + lrws(:) = 0 + !$omp end workshare + !write(0,*) 'omp version of COO%TRIL/TRIU' + lnz = 0 + unz = 0 + !$omp parallel do private(i,j,k) shared(imin_,imax_,a,lrws,urws) reduction(+: lnz,unz) + loop1: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)=diag_) then + !$omp atomic update + urws(i-imin_+1) = urws(i-imin_+1) +1 + !$omp end atomic + unz = unz + 1 + end if + end if + end do + end do loop3 + !$omp end parallel do + call psi_exscan(mb,urws,info) + !$omp parallel do private(i,j,k,upnt) shared(imin_,imax_,a) + loop4: do i=imin_,imax_ + do k = a%irp(i),a%irp(i+1)-1 + j = a%ja(k) + if ((jmin_<=j).and.(j<=jmax_)) then + if ((j-i)>=diag_) then + !$omp atomic capture + urws(i-imin_+1) = urws(i-imin_+1) +1 + upnt = urws(i-imin_+1) + !$omp end atomic + u%ia(upnt) = i + u%ja(upnt) = a%ja(k) + u%val(upnt) = a%val(k) + end if + end if + end do + end do loop4 + !$omp end parallel do + call u%set_nzeros(unz) + call u%fix(info) + end if + nzout = u%get_nzeros() + if (rscale_) then + !$omp workshare + u%ia(1:nzout) = u%ia(1:nzout) - imin_ + 1 + !$omp end workshare + end if + if (cscale_) then + !$omp workshare + u%ja(1:nzout) = u%ja(1:nzout) - jmin_ + 1 + !$omp end workshare + end if + + if ((diag_ >= 0).and.(imin_ == jmin_)) then + call u%set_triangle(.true.) + call u%set_upper(.true.) + end if + end block + + +#else + nz = a%get_nzeros() call u%allocate(mb,nb,nz) @@ -2511,7 +2811,7 @@ subroutine psb_z_csr_triu(a,u,info,& call u%set_triangle(.true.) call u%set_upper(.true.) end if - +#endif if (info /= psb_success_) goto 9999 call psb_erractionrestore(err_act) @@ -3190,9 +3490,28 @@ subroutine psb_z_cp_csr_to_fmt(a,b,info) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nr = a%get_nrows() nz = a%get_nzeros() - if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) - if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) - if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( a%irp(1:nr+1), b%irp , info) + if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) + if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,b%irp,info) + call psb_realloc(nz,b%ja,info) + call psb_realloc(nz,b%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + b%irp(i)=a%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + b%ja(j) = a%ja(j) + b%val(j) = a%val(j) + end do + !$omp end parallel do + end if + call b%set_host() class default @@ -3276,9 +3595,27 @@ subroutine psb_z_cp_csr_from_fmt(a,b,info) a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nr = b%get_nrows() nz = b%get_nzeros() - if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) - if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) - if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + if (.false.) then + if (info == 0) call psb_safe_cpy( b%irp(1:nr+1), a%irp , info) + if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) + if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) + else + ! Despite the implementation in safe_cpy, it seems better this way + call psb_realloc(nr+1,a%irp,info) + call psb_realloc(nz,a%ja,info) + call psb_realloc(nz,a%val,info) + !$omp parallel do private(i) schedule(static) + do i=1,nr+1 + a%irp(i)=b%irp(i) + end do + !$omp end parallel do + !$omp parallel do private(j) schedule(static) + do j=1,nz + a%ja(j)=b%ja(j) + a%val(j)=b%val(j) + end do + !$omp end parallel do + end if call a%set_host() class default