From 54104e0cc618fc7530e6fbcb0a034a250146b0f3 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 16 May 2024 10:24:29 +0200 Subject: [PATCH] Merge changes from repackage --- base/serial/impl/psb_c_coo_impl.F90 | 199 ++++++++++++++++------------ base/serial/impl/psb_d_coo_impl.F90 | 199 ++++++++++++++++------------ base/serial/impl/psb_s_coo_impl.F90 | 199 ++++++++++++++++------------ base/serial/impl/psb_z_coo_impl.F90 | 199 ++++++++++++++++------------ 4 files changed, 460 insertions(+), 336 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 46391dee..d6117546 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -165,7 +165,8 @@ subroutine psb_c_coo_scals(d,a,info) if (a%is_unit()) then call a%make_nonunit() end if - + + !$omp parallel do private(i) do i=1,a%get_nzeros() a%val(i) = a%val(i) * d enddo @@ -2869,6 +2870,7 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() + nzaold = nza isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then @@ -2878,14 +2880,19 @@ subroutine psb_c_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) else - nzaold = nza +#if defined(OPENMP) nza = nza + nz +#endif call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) +#if !defined(OPENMP) + nza = nzaold + call a%set_nzeros(nza) +#endif call a%set_sorted(.false.) else if (a%is_upd()) then @@ -4168,7 +4175,6 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4199,7 +4205,7 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) @@ -4262,7 +4268,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnzr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4271,7 +4277,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4295,11 +4301,14 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then - iaux(:) = 0 + !if (use_buffers) then #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnzr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnzr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4313,7 +4322,16 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnzr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnzr) + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do + !$OMP END PARALLEL DO #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4327,8 +4345,12 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnzr = 0 + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4336,22 +4358,21 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnzr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4376,60 +4397,67 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4547,7 +4575,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then @@ -4704,7 +4732,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnzr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4726,38 +4754,41 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + ! --------------------------------------------------- - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - last_elem = j - end if - end do + last_elem = j + end if + end do + end block !$OMP END PARALLEL #else i = 1 diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index c2babf8e..4cb4c3ec 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -165,7 +165,8 @@ subroutine psb_d_coo_scals(d,a,info) if (a%is_unit()) then call a%make_nonunit() end if - + + !$omp parallel do private(i) do i=1,a%get_nzeros() a%val(i) = a%val(i) * d enddo @@ -2869,6 +2870,7 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() + nzaold = nza isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then @@ -2878,14 +2880,19 @@ subroutine psb_d_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) else - nzaold = nza +#if defined(OPENMP) nza = nza + nz +#endif call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) +#if !defined(OPENMP) + nza = nzaold + call a%set_nzeros(nza) +#endif call a%set_sorted(.false.) else if (a%is_upd()) then @@ -4168,7 +4175,6 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4199,7 +4205,7 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) @@ -4262,7 +4268,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnzr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4271,7 +4277,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4295,11 +4301,14 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then - iaux(:) = 0 + !if (use_buffers) then #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnzr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnzr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4313,7 +4322,16 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnzr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnzr) + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do + !$OMP END PARALLEL DO #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4327,8 +4345,12 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnzr = 0 + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4336,22 +4358,21 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnzr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4376,60 +4397,67 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4547,7 +4575,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then @@ -4704,7 +4732,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnzr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4726,38 +4754,41 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + ! --------------------------------------------------- - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - last_elem = j - end if - end do + last_elem = j + end if + end do + end block !$OMP END PARALLEL #else i = 1 diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 402c608a..f706db33 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -165,7 +165,8 @@ subroutine psb_s_coo_scals(d,a,info) if (a%is_unit()) then call a%make_nonunit() end if - + + !$omp parallel do private(i) do i=1,a%get_nzeros() a%val(i) = a%val(i) * d enddo @@ -2869,6 +2870,7 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() + nzaold = nza isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then @@ -2878,14 +2880,19 @@ subroutine psb_s_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) else - nzaold = nza +#if defined(OPENMP) nza = nza + nz +#endif call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) +#if !defined(OPENMP) + nza = nzaold + call a%set_nzeros(nza) +#endif call a%set_sorted(.false.) else if (a%is_upd()) then @@ -4168,7 +4175,6 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4199,7 +4205,7 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) @@ -4262,7 +4268,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) real(psb_spk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnzr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4271,7 +4277,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4295,11 +4301,14 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then - iaux(:) = 0 + !if (use_buffers) then #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnzr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnzr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4313,7 +4322,16 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnzr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnzr) + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do + !$OMP END PARALLEL DO #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4327,8 +4345,12 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnzr = 0 + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4336,22 +4358,21 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnzr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4376,60 +4397,67 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4547,7 +4575,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then @@ -4704,7 +4732,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnzr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4726,38 +4754,41 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + ! --------------------------------------------------- - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - last_elem = j - end if - end do + last_elem = j + end if + end do + end block !$OMP END PARALLEL #else i = 1 diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 542f842e..c368ce91 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -165,7 +165,8 @@ subroutine psb_z_coo_scals(d,a,info) if (a%is_unit()) then call a%make_nonunit() end if - + + !$omp parallel do private(i) do i=1,a%get_nzeros() a%val(i) = a%val(i) * d enddo @@ -2869,6 +2870,7 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) ! Hence the call to set_nzeros done here. !$omp critical nza = a%get_nzeros() + nzaold = nza isza = a%get_size() ! Build phase. Must handle reallocations in a sensible way. if (isza < (nza+nz)) then @@ -2878,14 +2880,19 @@ subroutine psb_z_coo_csput_a(nz,ia,ja,val,a,imin,imax,jmin,jmax,info) if (isza < (nza+nz)) then info = psb_err_alloc_dealloc_; call psb_errpush(info,name) else - nzaold = nza +#if defined(OPENMP) nza = nza + nz +#endif call a%set_nzeros(nza) end if !$omp end critical if (info /= 0) goto 9999 call psb_inner_ins(nz,ia,ja,val,nzaold,a%ia,a%ja,a%val,isza,& & imin,imax,jmin,jmax,info) +#if !defined(OPENMP) + nza = nzaold + call a%set_nzeros(nza) +#endif call a%set_sorted(.false.) else if (a%is_upd()) then @@ -4168,7 +4175,6 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) #endif info = psb_success_ @@ -4199,7 +4205,7 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) ! 'iaux' has to allow the threads to have an exclusive group ! of indices as work space. Since each thread handles one ! row/column at the time, we allocate this way. - allocate(iaux(MAX((nc+2),(nr+2))*maxthreads),stat=info) + allocate(iaux(MAX((nc+2),(nr+2))),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) @@ -4262,7 +4268,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !locals integer(psb_ipk_), allocatable :: ias(:),jas(:), ix2(:) complex(psb_dpk_), allocatable :: vs(:) - integer(psb_ipk_) :: nza, nzl,iret + integer(psb_ipk_) :: nza, nzl,iret, maxnzr integer(psb_ipk_) :: i,j, irw, icl, err_act, ip,is, imx, k, ii, i1, i2 integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' @@ -4271,7 +4277,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf #if defined(OPENMP) integer(psb_ipk_) :: work,idxstart,idxend,first_elem,last_elem,s,nthreads,ithread integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads - integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) + integer(psb_ipk_), allocatable :: kaux(:),idxaux(:) #endif info = psb_success_ @@ -4295,11 +4301,14 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf use_buffers = .false. end if - if (use_buffers) then - iaux(:) = 0 + !if (use_buffers) then #if defined(OPENMP) + !$omp workshare + iaux(:) = 0 + !$omp end workshare + maxnzr = 0 !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & + !$OMP shared(nzin,ia,nr,iaux,maxnzr) & !$OMP private(i) & !$OMP reduction(.and.:use_buffers) do i=1,nzin @@ -4313,7 +4322,16 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf end if end do !$OMP END PARALLEL DO + maxnzr = 0 + !$OMP PARALLEL DO default(none) schedule(STATIC) & + !$OMP private(i) shared(nr,iaux)& + !$OMP reduction(max:maxnzr) + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do + !$OMP END PARALLEL DO #else + iaux(:) = 0 !srt_inp = .true. do i=1,nzin if ((ia(i) < 1).or.(ia(i) > nr)) then @@ -4327,8 +4345,12 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) end do + maxnzr = 0 + do i=1,nr + maxnzr = max(maxnzr,iaux(i)) + end do #endif - end if + !end if ! Check again use_buffers. We enter here if nzin >= nr and ! all the indices are valid @@ -4336,22 +4358,21 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf if (use_buffers) then #if defined(OPENMP) maxthreads = omp_get_max_threads() - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) + allocate(kaux(nr+1),idxaux(MAX(nc+2,nr+2)),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ call psb_errpush(info,name) goto 9999 end if + !$omp workshare kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 + !$omp end workshare err = 0 - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting ! index for each row. We do the same on 'kaux' !$OMP PARALLEL default(none) & - !$OMP shared(t0,t1,idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl,err) & + !$OMP shared(maxnzr,idxaux,ia,ja,val,ias,jas,vs,nthreads,nr,nc,nzin,iaux,kaux,dupl,err) & !$OMP private(s,i,j,k,ithread,idxstart,idxend,work,nxt_val,old_val,saved_elem, & !$OMP first_elem,last_elem,nzl,iret,act_row,i1,i2) reduction(max: info) @@ -4376,60 +4397,67 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf !write(0,*) 'fix_coo_inner: trying with exscan' call psi_exscan(nr+1,iaux,info,shift=ione) !$OMP BARRIER - !$OMP SINGLE - !t0 = omp_get_wtime() - !$OMP END SINGLE - + + ! ------------------ Sorting and buffers ------------------- ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving ! unmodified 'iaux' - do j=idxstart,idxend + + !$omp do private(j) + do j=1,nr+1 idxaux(j) = iaux(j) end do + !$omp end do + ! Here we sort data inside the auxiliary buffers + !$omp do private(act_row,i,i1) do i=1,nzin act_row = ia(i) - if ((act_row >= idxstart) .and. (act_row <= idxend)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - idxaux(act_row) = idxaux(act_row) + 1 - end if + !$omp atomic capture + i1 =idxaux(act_row) + idxaux(act_row) = idxaux(act_row) + 1 + !$omp end atomic + ias(i1) = ia(i) + jas(i1) = ja(i) + vs(i1) = val(i) end do + !$omp end do - !$OMP BARRIER - !$OMP SINGLE - !t1 = omp_get_wtime() - !write(0,*) ithread,'Srt&Cpy :',t1-t0 - !$OMP END SINGLE ! Let's sort column indices and values. After that we will store ! the number of unique values in 'kaux' - do j=idxstart,idxend - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + !$omp do private(j,first_elem,last_elem,nzl,iret) schedule(dynamic,256) + do j=1,nr + first_elem = iaux(j) + last_elem = iaux(j+1) - 1 + nzl = last_elem - first_elem + 1 - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle + ! The row has elements? + if (nzl > 0) then + call psi_msort_up(nzl,jas(first_elem:last_elem), & + & ixt,iret) + if (iret == 0) then + call psb_ip_reord(nzl,vs(first_elem:last_elem),& + & ias(first_elem:last_elem),jas(first_elem:last_elem), & + & ixt) end if - kaux(j) = kaux(j) + 1 - end do - end if - end do + + ! Over each row we count the unique values + kaux(j) = 1 + do i=first_elem+1,last_elem + if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then + cycle + end if + kaux(j) = kaux(j) + 1 + end do + end if + end do + !$omp end do + deallocate(ixt) + end block ! -------------------------------------------------- ! ---------------- kaux composition ---------------- @@ -4547,7 +4575,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf nzout = kaux(nr+1) - 1 - deallocate(sum,kaux,idxaux,stat=info) + deallocate(kaux,idxaux,stat=info) #else !if (.not.srt_inp) then @@ -4704,7 +4732,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf & call psb_ip_reord(nzin,val,ia,ja,iaux) #if defined(OPENMP) !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & + !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads,maxnzr) & !$OMP private(i,j,idxstart,idxend,nzl,act_row,iret,ithread, & !$OMP work,first_elem,last_elem) @@ -4726,38 +4754,41 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf idxend = idxstart + work - 1 - ! --------------------------------------------------- + block + integer(psb_ipk_), allocatable :: ixt(:) + allocate(ixt(2*maxnzr+2)) + ! --------------------------------------------------- - first_elem = 0 - last_elem = -1 - act_row = idxstart - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > idxend) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 + first_elem = 0 + last_elem = -1 + act_row = idxstart + do j=1,nzin + if (ia(j) < act_row) then + cycle + else if ((ia(j) > idxend) .or. (work < 1)) then + exit + else if (ia(j) > act_row) then + nzl = last_elem - first_elem + 1 - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if + if (nzl > 0) then + call psi_msort_up(nzl,ja(first_elem:last_elem),ixt,iret) + if (iret == 0) & + & call psb_ip_reord(nzl,val(first_elem:last_elem),& + & ia(first_elem:last_elem),ja(first_elem:last_elem),ixt) + end if - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if + act_row = act_row + 1 + first_elem = 0 + last_elem = -1 + else + if (first_elem == 0) then + first_elem = j + end if - last_elem = j - end if - end do + last_elem = j + end if + end do + end block !$OMP END PARALLEL #else i = 1