From a613e963dbf823664a4aa90413931b12e8584705 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 10 May 2024 10:10:54 +0200 Subject: [PATCH 1/3] First step in fix for coo_impl on OpenMP --- base/serial/impl/psb_c_coo_impl.F90 | 2 +- base/serial/impl/psb_d_coo_impl.F90 | 2 +- base/serial/impl/psb_s_coo_impl.F90 | 2 +- base/serial/impl/psb_z_coo_impl.F90 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 5c90e287..84517ead 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -4205,7 +4205,7 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index f6a173d1..1b122fd8 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -4205,7 +4205,7 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 4c12d8fc..3df01ba6 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -4205,7 +4205,7 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 44ee89b5..356914de 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -4205,7 +4205,7 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) From ecccb1391437fea6dc6984129e79e32140293091 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 10 May 2024 13:53:35 +0200 Subject: [PATCH 2/3] Fix COO fix_coo_inner_rowmajor not to overflow on integers. --- base/serial/impl/psb_c_coo_impl.F90 | 109 +++++++++++++++------------- base/serial/impl/psb_d_coo_impl.F90 | 109 +++++++++++++++------------- base/serial/impl/psb_s_coo_impl.F90 | 109 +++++++++++++++------------- base/serial/impl/psb_z_coo_impl.F90 | 109 +++++++++++++++------------- 4 files changed, 240 insertions(+), 196 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 84517ead..cc3c6842 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -4268,7 +4268,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4277,7 +4277,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4302,10 +4302,13 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if if (use_buffers) then - iaux(:) = 0 #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4319,7 +4322,9 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnr = maxval(iaux(1:nr)) #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4342,22 +4347,21 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4382,60 +4386,67 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(maxnr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4553,7 +4564,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 1b122fd8..e523d83d 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -4268,7 +4268,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4277,7 +4277,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4302,10 +4302,13 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if if (use_buffers) then - iaux(:) = 0 #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4319,7 +4322,9 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnr = maxval(iaux(1:nr)) #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4342,22 +4347,21 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4382,60 +4386,67 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(maxnr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4553,7 +4564,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 3df01ba6..b46731fc 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -4268,7 +4268,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4277,7 +4277,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4302,10 +4302,13 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if if (use_buffers) then - iaux(:) = 0 #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4319,7 +4322,9 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnr = maxval(iaux(1:nr)) #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4342,22 +4347,21 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4382,60 +4386,67 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(maxnr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4553,7 +4564,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 356914de..a7e527e5 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -4268,7 +4268,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4277,7 +4277,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4302,10 +4302,13 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if if (use_buffers) then - iaux(:) = 0 #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4319,7 +4322,9 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnr = maxval(iaux(1:nr)) #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4342,22 +4347,21 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4382,60 +4386,67 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(maxnr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4553,7 +4564,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then From 70f51b9da89cb36897bedbdb80f29fbc439ac21e Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 15 May 2024 12:05:15 +0200 Subject: [PATCH 3/3] Improve handling of fix_coo buffers with OpenMP --- base/serial/impl/psb_c_coo_impl.F90 | 81 +++++++++++++++++------------ base/serial/impl/psb_d_coo_impl.F90 | 81 +++++++++++++++++------------ base/serial/impl/psb_s_coo_impl.F90 | 81 +++++++++++++++++------------ base/serial/impl/psb_z_coo_impl.F90 | 81 +++++++++++++++++------------ 4 files changed, 188 insertions(+), 136 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index cc3c6842..1b015ab1 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -4174,7 +4174,6 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4301,7 +4300,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then + !if (use_buffers) then #if defined(OPENMP) !$omp workshare iaux(:) = 0 @@ -4322,7 +4321,14 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO - maxnr = maxval(iaux(1:nr)) + maxnr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnr) + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do + !$OMP END PARALLEL DO #else iaux(:) = 0 !srt_inp = .true. @@ -4338,8 +4344,12 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnr = 0 + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4417,7 +4427,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! the number of unique values in 'kaux' block integer(psb_ipk_), allocatable :: ixt(:) - allocate(ixt(maxnr+2)) + allocate(ixt(2*maxnr+2)) !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) do j=1,nr first_elem = iaux(j) @@ -4721,7 +4731,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4743,38 +4753,41 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnr+2)) + ! --------------------------------------------------- + + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j + last_elem = j end if - - last_elem = j - end if - end do + end do + end block !$OMP END PARALLEL #else i = 1 diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index e523d83d..e714ef5e 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -4174,7 +4174,6 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4301,7 +4300,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then + !if (use_buffers) then #if defined(OPENMP) !$omp workshare iaux(:) = 0 @@ -4322,7 +4321,14 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO - maxnr = maxval(iaux(1:nr)) + maxnr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnr) + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do + !$OMP END PARALLEL DO #else iaux(:) = 0 !srt_inp = .true. @@ -4338,8 +4344,12 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnr = 0 + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4417,7 +4427,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! the number of unique values in 'kaux' block integer(psb_ipk_), allocatable :: ixt(:) - allocate(ixt(maxnr+2)) + allocate(ixt(2*maxnr+2)) !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) do j=1,nr first_elem = iaux(j) @@ -4721,7 +4731,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4743,38 +4753,41 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnr+2)) + ! --------------------------------------------------- + + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j + last_elem = j end if - - last_elem = j - end if - end do + end do + end block !$OMP END PARALLEL #else i = 1 diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index b46731fc..029b9dbb 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -4174,7 +4174,6 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4301,7 +4300,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then + !if (use_buffers) then #if defined(OPENMP) !$omp workshare iaux(:) = 0 @@ -4322,7 +4321,14 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO - maxnr = maxval(iaux(1:nr)) + maxnr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnr) + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do + !$OMP END PARALLEL DO #else iaux(:) = 0 !srt_inp = .true. @@ -4338,8 +4344,12 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnr = 0 + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4417,7 +4427,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! the number of unique values in 'kaux' block integer(psb_ipk_), allocatable :: ixt(:) - allocate(ixt(maxnr+2)) + allocate(ixt(2*maxnr+2)) !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) do j=1,nr first_elem = iaux(j) @@ -4721,7 +4731,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4743,38 +4753,41 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnr+2)) + ! --------------------------------------------------- + + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j + last_elem = j end if - - last_elem = j - end if - end do + end do + end block !$OMP END PARALLEL #else i = 1 diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index a7e527e5..32dd80b9 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -4174,7 +4174,6 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4301,7 +4300,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then + !if (use_buffers) then #if defined(OPENMP) !$omp workshare iaux(:) = 0 @@ -4322,7 +4321,14 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO - maxnr = maxval(iaux(1:nr)) + maxnr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnr) + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do + !$OMP END PARALLEL DO #else iaux(:) = 0 !srt_inp = .true. @@ -4338,8 +4344,12 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnr = 0 + do i=1,nr + maxnr = max(maxnr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4417,7 +4427,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! the number of unique values in 'kaux' block integer(psb_ipk_), allocatable :: ixt(:) - allocate(ixt(maxnr+2)) + allocate(ixt(2*maxnr+2)) !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) do j=1,nr first_elem = iaux(j) @@ -4721,7 +4731,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4743,38 +4753,41 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnr+2)) + ! --------------------------------------------------- + + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j + last_elem = j end if - - last_elem = j - end if - end do + end do + end block !$OMP END PARALLEL #else i = 1