From db0577cd0777ab1e80955bcf460fca66cf1afa51 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Wed, 23 Mar 2022 19:44:58 +0100 Subject: [PATCH] Fix fix_coo for OpenMP --- base/serial/impl/psb_c_coo_impl.F90 | 1300 +-------------------------- base/serial/impl/psb_c_csr_impl.f90 | 292 +++--- base/serial/impl/psb_d_coo_impl.F90 | 1300 +-------------------------- base/serial/impl/psb_d_csr_impl.f90 | 292 +++--- base/serial/impl/psb_s_coo_impl.F90 | 1300 +-------------------------- base/serial/impl/psb_s_csr_impl.f90 | 292 +++--- base/serial/impl/psb_z_coo_impl.F90 | 1300 +-------------------------- base/serial/impl/psb_z_csr_impl.f90 | 292 +++--- 8 files changed, 660 insertions(+), 5708 deletions(-) diff --git a/base/serial/impl/psb_c_coo_impl.F90 b/base/serial/impl/psb_c_coo_impl.F90 index 5b478c0e..5818c77b 100644 --- a/base/serial/impl/psb_c_coo_impl.F90 +++ b/base/serial/impl/psb_c_coo_impl.F90 @@ -3615,1288 +3615,14 @@ subroutine psb_c_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) select case(idir_) case(psb_row_major_) -#if 0 - ! Row major order - if (nr <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - ! We are not going to need 'ix2' because of the presence - ! of 'idxaux' as auxiliary buffer. - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ia(i) = nr+2 - else - !$OMP ATOMIC UPDATE - iaux(ia(i)) = iaux(ia(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - !ia(i) = nr+2 - !srt_inp = .false. - exit - end if - - iaux(ia(i)) = iaux(ia(i)) + 1 - - !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) - end do -#endif - end if - - ! Check again use_buffers. We enter here if nzin >= nr and - ! all the indices are valid - ! Check again use_buffers. - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each row. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving - ! unmodified 'iaux' - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - - idxaux(act_row) = idxaux(act_row) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nr+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - - ip = iaux(1) - iaux(1) = 0 - do i=2, nr - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nr+1) = ip - - do i=1,nzin - irw = ia(i) - ip = iaux(irw) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(irw) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - ! - ! If we did not have enough memory for buffers, - ! let's try in place. - ! - call psi_msort_up(nzin,ia(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & - !$OMP work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_row = first_idx - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - - do while ((ia(j) == ia(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ja(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - select case(dupl_) - case(psb_dupl_ovwrt_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - endif - - if(debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' -#else - call psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) -#endif - case(psb_col_major_) -#if 0 - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl_) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if -#else - call psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& + call psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) -#endif + case(psb_col_major_) + ! Dirty trick: call ROWMAJOR with rows <-> columns + call psb_c_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& + & ja,ia,val,iaux,nzout,info) +!!$ call psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& +!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -4939,6 +3665,11 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif ! Row major order if (nr <= nzin) then @@ -4995,6 +3726,7 @@ subroutine psb_c_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Check again use_buffers. if (use_buffers) then #if defined(OPENMP) + maxthreads = omp_get_max_threads() allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ @@ -5620,6 +4352,12 @@ subroutine psb_c_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif + if (nc <= nzin) then ! Avoid strange situations with large indices #if defined(OPENMP) diff --git a/base/serial/impl/psb_c_csr_impl.f90 b/base/serial/impl/psb_c_csr_impl.f90 index e37e594f..88510e53 100644 --- a/base/serial/impl/psb_c_csr_impl.f90 +++ b/base/serial/impl/psb_c_csr_impl.f90 @@ -2902,78 +2902,78 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$ maxthreads = omp_get_max_threads() - !$ allocate(sum(maxthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - - !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$OMP BARRIER - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$ maxthreads = omp_get_max_threads() +!!$ !$ allocate(sum(maxthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) @@ -2986,7 +2986,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() @@ -3104,10 +3104,10 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - !$ use_openmp = .true. + ! $ integer(psb_ipk_), allocatable :: sum(:) + ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem + ! $ use_openmp = .true. info = psb_success_ @@ -3135,74 +3135,74 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$ allocate(sum(nthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$ allocate(sum(nthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3214,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() diff --git a/base/serial/impl/psb_d_coo_impl.F90 b/base/serial/impl/psb_d_coo_impl.F90 index 888d8a8b..21590eb2 100644 --- a/base/serial/impl/psb_d_coo_impl.F90 +++ b/base/serial/impl/psb_d_coo_impl.F90 @@ -3615,1288 +3615,14 @@ subroutine psb_d_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) select case(idir_) case(psb_row_major_) -#if 0 - ! Row major order - if (nr <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - ! We are not going to need 'ix2' because of the presence - ! of 'idxaux' as auxiliary buffer. - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ia(i) = nr+2 - else - !$OMP ATOMIC UPDATE - iaux(ia(i)) = iaux(ia(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - !ia(i) = nr+2 - !srt_inp = .false. - exit - end if - - iaux(ia(i)) = iaux(ia(i)) + 1 - - !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) - end do -#endif - end if - - ! Check again use_buffers. We enter here if nzin >= nr and - ! all the indices are valid - ! Check again use_buffers. - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each row. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving - ! unmodified 'iaux' - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - - idxaux(act_row) = idxaux(act_row) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nr+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - - ip = iaux(1) - iaux(1) = 0 - do i=2, nr - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nr+1) = ip - - do i=1,nzin - irw = ia(i) - ip = iaux(irw) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(irw) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - ! - ! If we did not have enough memory for buffers, - ! let's try in place. - ! - call psi_msort_up(nzin,ia(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & - !$OMP work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_row = first_idx - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - - do while ((ia(j) == ia(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ja(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - select case(dupl_) - case(psb_dupl_ovwrt_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - endif - - if(debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' -#else - call psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) -#endif - case(psb_col_major_) -#if 0 - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl_) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if -#else - call psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& + call psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) -#endif + case(psb_col_major_) + ! Dirty trick: call ROWMAJOR with rows <-> columns + call psb_d_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& + & ja,ia,val,iaux,nzout,info) +!!$ call psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& +!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -4939,6 +3665,11 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif ! Row major order if (nr <= nzin) then @@ -4995,6 +3726,7 @@ subroutine psb_d_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Check again use_buffers. if (use_buffers) then #if defined(OPENMP) + maxthreads = omp_get_max_threads() allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ @@ -5620,6 +4352,12 @@ subroutine psb_d_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif + if (nc <= nzin) then ! Avoid strange situations with large indices #if defined(OPENMP) diff --git a/base/serial/impl/psb_d_csr_impl.f90 b/base/serial/impl/psb_d_csr_impl.f90 index 8a14fa37..a927b1a9 100644 --- a/base/serial/impl/psb_d_csr_impl.f90 +++ b/base/serial/impl/psb_d_csr_impl.f90 @@ -2902,78 +2902,78 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$ maxthreads = omp_get_max_threads() - !$ allocate(sum(maxthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - - !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$OMP BARRIER - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$ maxthreads = omp_get_max_threads() +!!$ !$ allocate(sum(maxthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) @@ -2986,7 +2986,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() @@ -3104,10 +3104,10 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - !$ use_openmp = .true. + ! $ integer(psb_ipk_), allocatable :: sum(:) + ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem + ! $ use_openmp = .true. info = psb_success_ @@ -3135,74 +3135,74 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$ allocate(sum(nthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$ allocate(sum(nthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3214,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() diff --git a/base/serial/impl/psb_s_coo_impl.F90 b/base/serial/impl/psb_s_coo_impl.F90 index 2a2b3d0e..8b29e565 100644 --- a/base/serial/impl/psb_s_coo_impl.F90 +++ b/base/serial/impl/psb_s_coo_impl.F90 @@ -3615,1288 +3615,14 @@ subroutine psb_s_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) select case(idir_) case(psb_row_major_) -#if 0 - ! Row major order - if (nr <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - ! We are not going to need 'ix2' because of the presence - ! of 'idxaux' as auxiliary buffer. - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ia(i) = nr+2 - else - !$OMP ATOMIC UPDATE - iaux(ia(i)) = iaux(ia(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - !ia(i) = nr+2 - !srt_inp = .false. - exit - end if - - iaux(ia(i)) = iaux(ia(i)) + 1 - - !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) - end do -#endif - end if - - ! Check again use_buffers. We enter here if nzin >= nr and - ! all the indices are valid - ! Check again use_buffers. - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each row. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving - ! unmodified 'iaux' - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - - idxaux(act_row) = idxaux(act_row) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nr+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - - ip = iaux(1) - iaux(1) = 0 - do i=2, nr - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nr+1) = ip - - do i=1,nzin - irw = ia(i) - ip = iaux(irw) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(irw) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - ! - ! If we did not have enough memory for buffers, - ! let's try in place. - ! - call psi_msort_up(nzin,ia(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & - !$OMP work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_row = first_idx - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - - do while ((ia(j) == ia(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ja(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - select case(dupl_) - case(psb_dupl_ovwrt_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - endif - - if(debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' -#else - call psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) -#endif - case(psb_col_major_) -#if 0 - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl_) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if -#else - call psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& + call psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) -#endif + case(psb_col_major_) + ! Dirty trick: call ROWMAJOR with rows <-> columns + call psb_s_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& + & ja,ia,val,iaux,nzout,info) +!!$ call psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& +!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -4939,6 +3665,11 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif ! Row major order if (nr <= nzin) then @@ -4995,6 +3726,7 @@ subroutine psb_s_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Check again use_buffers. if (use_buffers) then #if defined(OPENMP) + maxthreads = omp_get_max_threads() allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ @@ -5620,6 +4352,12 @@ subroutine psb_s_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif + if (nc <= nzin) then ! Avoid strange situations with large indices #if defined(OPENMP) diff --git a/base/serial/impl/psb_s_csr_impl.f90 b/base/serial/impl/psb_s_csr_impl.f90 index fbfed792..b2d9dd48 100644 --- a/base/serial/impl/psb_s_csr_impl.f90 +++ b/base/serial/impl/psb_s_csr_impl.f90 @@ -2902,78 +2902,78 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$ maxthreads = omp_get_max_threads() - !$ allocate(sum(maxthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - - !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$OMP BARRIER - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$ maxthreads = omp_get_max_threads() +!!$ !$ allocate(sum(maxthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) @@ -2986,7 +2986,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() @@ -3104,10 +3104,10 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - !$ use_openmp = .true. + ! $ integer(psb_ipk_), allocatable :: sum(:) + ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem + ! $ use_openmp = .true. info = psb_success_ @@ -3135,74 +3135,74 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$ allocate(sum(nthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$ allocate(sum(nthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3214,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() diff --git a/base/serial/impl/psb_z_coo_impl.F90 b/base/serial/impl/psb_z_coo_impl.F90 index 02613b0e..48ea3a50 100644 --- a/base/serial/impl/psb_z_coo_impl.F90 +++ b/base/serial/impl/psb_z_coo_impl.F90 @@ -3615,1288 +3615,14 @@ subroutine psb_z_fix_coo_inner(nr,nc,nzin,dupl,ia,ja,val,nzout,info,idir) select case(idir_) case(psb_row_major_) -#if 0 - ! Row major order - if (nr <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - ! We are not going to need 'ix2' because of the presence - ! of 'idxaux' as auxiliary buffer. - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ia,nr,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ia(i) = nr+2 - else - !$OMP ATOMIC UPDATE - iaux(ia(i)) = iaux(ia(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ia(i) < 1).or.(ia(i) > nr)) then - use_buffers = .false. - !ia(i) = nr+2 - !srt_inp = .false. - exit - end if - - iaux(ia(i)) = iaux(ia(i)) + 1 - - !srt_inp = srt_inp .and.(ia(i-1)<=ia(i)) - end do -#endif - end if - - ! Check again use_buffers. We enter here if nzin >= nr and - ! all the indices are valid - ! Check again use_buffers. - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each row. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_row) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ------------ iaux composition -------------- - ! 'iaux' will have the start index for each row - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer, 'idxaux', to get indices leaving - ! unmodified 'iaux' - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_row = ia(i) - if ((act_row >= first_idx) .and. (act_row <= last_idx)) then - ias(idxaux(act_row)) = ia(i) - jas(idxaux(act_row)) = ja(i) - vs(idxaux(act_row)) = val(i) - - idxaux(act_row) = idxaux(act_row) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The row has elements? - if (nzl > 0) then - call psi_msort_up(nzl,jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - ! Over each row we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - kaux(nr+1) = sum(nthreads+1) - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nr - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nr+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - - ip = iaux(1) - iaux(1) = 0 - do i=2, nr - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nr+1) = ip - - do i=1,nzin - irw = ia(i) - ip = iaux(irw) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(irw) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nr - - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,jas(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - ! - ! If we did not have enough memory for buffers, - ! let's try in place. - ! - call psi_msort_up(nzin,ia(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_row,iret,ithread, & - !$OMP work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nr/nthreads - if (ithread < MOD(nr,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nr,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_row = first_idx - do j=1,nzin - if (ia(j) < act_row) then - cycle - else if ((ia(j) > last_idx) .or. (work < 1)) then - exit - else if (ia(j) > act_row) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ja(first_elem:),iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nc+2))+1:(ithread*(nc+2))+nzl+2)) - end if - - act_row = act_row + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - - do while ((ia(j) == ia(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ja(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - select case(dupl_) - case(psb_dupl_ovwrt_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - endif - - if(debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' -#else - call psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& - & ia,ja,val,iaux,nzout,info) -#endif - case(psb_col_major_) -#if 0 - if (nc <= nzin) then - ! Avoid strange situations with large indices -#if defined(OPENMP) - allocate(ias(nzin),jas(nzin),vs(nzin), stat=info) -#else - allocate(ias(nzin),jas(nzin),vs(nzin),ix2(nzin+2), stat=info) -#endif - use_buffers = (info == 0) - else - use_buffers = .false. - end if - - if (use_buffers) then - iaux(:) = 0 -#if defined(OPENMP) - !$OMP PARALLEL DO default(none) schedule(STATIC) & - !$OMP shared(nzin,ja,nc,iaux) & - !$OMP private(i) & - !$OMP reduction(.and.:use_buffers) - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - ! Invalid indices are placed outside the considered range - ja(i) = nc+2 - else - !$OMP ATOMIC UPDATE - iaux(ja(i)) = iaux(ja(i)) + 1 - end if - end do - !$OMP END PARALLEL DO -#else - !srt_inp = .true. - do i=1,nzin - if ((ja(i) < 1).or.(ja(i) > nc)) then - use_buffers = .false. - !ja(i) = nc+2 - !srt_inp = .false. - exit - end if - - iaux(ja(i)) = iaux(ja(i)) + 1 - - !srt_inp = srt_inp .and.(ja(i-1)<=ja(i)) - end do -#endif - end if - - !use_buffers=use_buffers.and.srt_inp - - ! Check again use_buffers. We enter here if nzin >= nc and - ! all the indices are valid - if (use_buffers) then -#if defined(OPENMP) - allocate(kaux(MAX(nzin,nc)+2),idxaux(MAX((nr+2)*maxthreads,nc)),sum(maxthreads+1),stat=info) - if (info /= psb_success_) then - info = psb_err_alloc_dealloc_ - call psb_errpush(info,name) - goto 9999 - end if - - kaux(:) = 0 - sum(:) = 0 - sum(1) = 1 - err = 0 - - ! Here, starting from 'iaux', we apply a fixing in order to obtain the starting - ! index for each column. We do the same on 'kaux' - !$OMP PARALLEL default(none) & - !$OMP shared(idxaux,ia,ja,val,ias,jas,vs,nthreads,sum,nr,nc,nzin,iaux,kaux,dupl_,err) & - !$OMP private(s,i,j,k,ithread,first_idx,last_idx,work,nxt_val,old_val,saved_elem, & - !$OMP first_elem,last_elem,nzl,iret,act_col) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - !------------------------------------------- - ! ---------- iaux composition -------------- - - s = 0 - do i=first_idx,last_idx - s = s + iaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = iaux(first_idx) - end if - if (ithread == 0) then - iaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = iaux(first_idx+1) - iaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = iaux(i) - iaux(i) = iaux(i-1) + old_val - old_val = nxt_val - end do - - ! -------------------------------------- - - !$OMP BARRIER - - ! ------------------ Sorting and buffers ------------------- - - ! Let's use an auxiliary buffer to get indices - do j=first_idx,last_idx - idxaux(j) = iaux(j) - end do - - ! Here we sort data inside the auxiliary buffers - do i=1,nzin - act_col = ja(i) - if (act_col >= first_idx .and. act_col <= last_idx) then - ias(idxaux(act_col)) = ia(i) - jas(idxaux(act_col)) = ja(i) - vs(idxaux(act_col)) = val(i) - - idxaux(act_col) = idxaux(act_col) + 1 - end if - end do - - !$OMP BARRIER - - ! Let's sort column indices and values. After that we will store - ! the number of unique values in 'kaux' - do j=first_idx,last_idx - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - nzl = last_elem - first_elem + 1 - - ! The column has elements? - if (nzl > 0) then - call psi_msort_up(nzl,ias(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - - if (iret == 0) then - call psb_ip_reord(nzl,vs(first_elem:last_elem),& - & ias(first_elem:last_elem),jas(first_elem:last_elem), & - & idxaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - ! Over each column we count the unique values - kaux(j) = 1 - do i=first_elem+1,last_elem - if (ias(i) == ias(i-1) .and. jas(i) == jas(i-1)) then - cycle - end if - kaux(j) = kaux(j) + 1 - end do - end if - end do - - ! -------------------------------------------------- - - ! ---------------- kaux composition ---------------- - - !$OMP SINGLE - sum(:) = 0 - sum(1) = 1 - !$OMP END SINGLE - - s = 0 - do i=first_idx,last_idx - s = s + kaux(i) - end do - sum(ithread+2) = s - - !$OMP BARRIER - - !$OMP SINGLE - do i=2,nthreads+1 - sum(i) = sum(i) + sum(i-1) - end do - !$OMP END SINGLE - - if (work > 0) then - saved_elem = kaux(first_idx) - end if - if (ithread == 0) then - kaux(1) = 1 - end if - - !$OMP BARRIER - - if (work > 0) then - old_val = kaux(first_idx+1) - kaux(first_idx+1) = saved_elem + sum(ithread+1) - end if - - do i=first_idx+2,last_idx+1 - nxt_val = kaux(i) - kaux(i) = kaux(i-1) + old_val - old_val = nxt_val - end do - - !$OMP BARRIER - - ! ------------------------------------------------ - - select case(dupl_) - case(psb_dupl_ovwrt_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_add_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - val(k) = val(k) + vs(i) - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - endif - end do - end do - !$OMP END DO - - case(psb_dupl_err_) - !$OMP DO schedule(STATIC) - do j=1,nc - first_elem = iaux(j) - last_elem = iaux(j+1) - 1 - - if (first_elem > last_elem) then - cycle - end if - - k = kaux(j) - - val(k) = vs(first_elem) - ia(k) = ias(first_elem) - ja(k) = jas(first_elem) - - do i=first_elem+1,last_elem - if ((ias(i) == ias(i-1)).and.(jas(i) == jas(i-1))) then - err = 1 - else - k = k + 1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - end if - end do - end do - !$OMP END DO - - case default - !$OMP SINGLE - err = 2 - !$OMP END SINGLE - end select - - !$OMP END PARALLEL - - if (err == 1) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else if (err == 2) then - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end if - - nzout = kaux(nc+1) - 1 - - deallocate(sum,kaux,idxaux,stat=info) -#else - !if (.not.srt_inp) then - ip = iaux(1) - iaux(1) = 0 - do i=2, nc - is = iaux(i) - iaux(i) = ip - ip = ip + is - end do - iaux(nc+1) = ip - - do i=1,nzin - icl = ja(i) - ip = iaux(icl) + 1 - ias(ip) = ia(i) - jas(ip) = ja(i) - vs(ip) = val(i) - iaux(icl) = ip - end do - !end if - - select case(dupl_) - case(psb_dupl_ovwrt_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_add_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - val(k) = val(k) + vs(i) - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case(psb_dupl_err_) - k = 0 - i = 1 - do j=1, nc - nzl = iaux(j)-i+1 - imx = i+nzl-1 - - if (nzl > 0) then - call psi_msort_up(nzl,ias(i:imx),ix2,iret) - if (iret == 0) & - & call psb_ip_reord(nzl,vs(i:imx),& - & ias(i:imx),jas(i:imx),ix2) - k = k + 1 - ia(k) = ias(i) - ja(k) = jas(i) - val(k) = vs(i) - irw = ia(k) - icl = ja(k) - do - i = i + 1 - if (i > imx) exit - if ((ias(i) == irw).and.(jas(i) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - k = k+1 - val(k) = vs(i) - ia(k) = ias(i) - ja(k) = jas(i) - irw = ia(k) - icl = ja(k) - endif - enddo - end if - end do - - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - return - end select - - nzout = k - - deallocate(ix2, stat=info) -#endif - - deallocate(ias,jas,vs, stat=info) - - else if (.not.use_buffers) then - - call psi_msort_up(nzin,ja(1:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzin,val,ia,ja,iaux) -#if defined(OPENMP) - !$OMP PARALLEL default(none) & - !$OMP shared(nr,nc,nzin,iaux,ia,ja,val,nthreads) & - !$OMP private(i,j,first_idx,last_idx,nzl,act_col, & - !$OMP iret,ithread,work,first_elem,last_elem) - - !$OMP SINGLE - nthreads = omp_get_num_threads() - !$OMP END SINGLE - - ithread = omp_get_thread_num() - - ! -------- thread-specific workload -------- - - work = nc/nthreads - if (ithread < MOD(nc,nthreads)) then - work = work + 1 - first_idx = ithread*work + 1 - else - first_idx = ithread*work + MOD(nc,nthreads) + 1 - end if - - last_idx = first_idx + work - 1 - - ! --------------------------------------------------- - - first_elem = 0 - last_elem = -1 - act_col = first_idx - do j=1,nzin - if (ja(j) < act_col) then - cycle - else if ((ja(j) > last_idx) .or. (work < 1)) then - exit - else if (ja(j) > act_col) then - nzl = last_elem - first_elem + 1 - - if (nzl > 0) then - call psi_msort_up(nzl,ia(first_elem:),iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(first_elem:last_elem),& - & ia(first_elem:last_elem),ja(first_elem:last_elem),& - & iaux((ithread*(nr+2))+1:(ithread*(nr+2))+nzl+2)) - end if - - act_col = act_col + 1 - first_elem = 0 - last_elem = -1 - else - if (first_elem == 0) then - first_elem = j - end if - - last_elem = j - end if - end do - !$OMP END PARALLEL -#else - i = 1 - j = i - do while (i <= nzin) - do while ((ja(j) == ja(i))) - j = j+1 - if (j > nzin) exit - enddo - nzl = j - i - call psi_msort_up(nzl,ia(i:),iaux(1:),iret) - if (iret == 0) & - & call psb_ip_reord(nzl,val(i:i+nzl-1),& - & ia(i:i+nzl-1),ja(i:i+nzl-1),iaux) - i = j - enddo -#endif - - i = 1 - irw = ia(i) - icl = ja(i) - j = 1 - - - select case(dupl_) - case(psb_dupl_ovwrt_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_add_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - val(i) = val(i) + val(j) - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - - case(psb_dupl_err_) - do - j = j + 1 - if (j > nzin) exit - if ((ia(j) < 1 .or. ia(j) > nr) .or. (ja(j) < 1 .or. ja(j) > nc)) cycle - if ((ia(j) == irw).and.(ja(j) == icl)) then - call psb_errpush(psb_err_duplicate_coo,name) - goto 9999 - else - i = i+1 - val(i) = val(j) - ia(i) = ia(j) - ja(i) = ja(j) - irw = ia(i) - icl = ja(i) - endif - enddo - case default - write(psb_err_unit,*) 'Error in fix_coo: unsafe dupl',dupl_ - info =-7 - end select - - nzout = i - - if (debug_level >= psb_debug_serial_)& - & write(debug_unit,*) trim(name),': end second loop' - - end if -#else - call psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& + call psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,& & ia,ja,val,iaux,nzout,info) -#endif + case(psb_col_major_) + ! Dirty trick: call ROWMAJOR with rows <-> columns + call psb_z_fix_coo_inner_rowmajor(nc,nr,nzin,dupl,& + & ja,ia,val,iaux,nzout,info) +!!$ call psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,& +!!$ & ia,ja,val,iaux,nzout,info) case default write(debug_unit,*) trim(name),': unknown direction ',idir_ info = psb_err_internal_error_ @@ -4939,6 +3665,11 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif ! Row major order if (nr <= nzin) then @@ -4995,6 +3726,7 @@ subroutine psb_z_fix_coo_inner_rowmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf ! Check again use_buffers. if (use_buffers) then #if defined(OPENMP) + maxthreads = omp_get_max_threads() allocate(kaux(nr+1),idxaux(MAX((nc+2)*maxthreads,nr)),sum(maxthreads+1),stat=info) if (info /= psb_success_) then info = psb_err_alloc_dealloc_ @@ -5620,6 +4352,12 @@ subroutine psb_z_fix_coo_inner_colmajor(nr,nc,nzin,dupl,ia,ja,val,iaux,nzout,inf integer(psb_ipk_) :: debug_level, debug_unit character(len=20) :: name = 'psb_fixcoo' logical :: srt_inp, use_buffers +#if defined(OPENMP) + integer(psb_ipk_) :: work,first_idx,last_idx,first_elem,last_elem,s,nthreads,ithread + integer(psb_ipk_) :: saved_elem,old_val,nxt_val,err,act_row,act_col,maxthreads + integer(psb_ipk_), allocatable :: sum(:),kaux(:),idxaux(:) +#endif + if (nc <= nzin) then ! Avoid strange situations with large indices #if defined(OPENMP) diff --git a/base/serial/impl/psb_z_csr_impl.f90 b/base/serial/impl/psb_z_csr_impl.f90 index fba56848..ba2d322f 100644 --- a/base/serial/impl/psb_z_csr_impl.f90 +++ b/base/serial/impl/psb_z_csr_impl.f90 @@ -2902,78 +2902,78 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$ maxthreads = omp_get_max_threads() - !$ allocate(sum(maxthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - - !$OMP PARALLEL default(none) & - !$OMP shared(nza,itemp,a,nthreads,sum,nr) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$OMP BARRIER - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$ maxthreads = omp_get_max_threads() +!!$ !$ allocate(sum(maxthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(nza,itemp,a,nthreads,sum,nr) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) @@ -2986,7 +2986,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host() @@ -3104,10 +3104,10 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) character(len=20) :: name='mv_from_coo' logical :: use_openmp = .false. - !$ integer(psb_ipk_), allocatable :: sum(:) - !$ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s - !$ integer(psb_ipk_) :: nxt_val,old_val,saved_elem - !$ use_openmp = .true. + ! $ integer(psb_ipk_), allocatable :: sum(:) + ! $ integer(psb_ipk_) :: first_idx,last_idx,work,ithread,nthreads,s + ! $ integer(psb_ipk_) :: nxt_val,old_val,saved_elem + ! $ use_openmp = .true. info = psb_success_ @@ -3135,74 +3135,74 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) a%irp(:) = 0 - if (use_openmp) then - !$OMP PARALLEL default(none) & - !$OMP shared(sum,nthreads,nr,a,itemp,nza) & - !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) - - !$OMP DO schedule(STATIC) & - !$OMP private(k,i) - do k=1,nza - i = itemp(k) - a%irp(i) = a%irp(i) + 1 - end do - !$OMP END DO - - !$OMP SINGLE - !$ nthreads = omp_get_num_threads() - !$ allocate(sum(nthreads+1)) - !$ sum(:) = 0 - !$ sum(1) = 1 - !$OMP END SINGLE - - !$ ithread = omp_get_thread_num() - - !$ work = nr/nthreads - !$ if (ithread < MOD(nr,nthreads)) then - !$ work = work + 1 - !$ first_idx = ithread*work + 1 - !$ else - !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 - !$ end if - - !$ last_idx = first_idx + work - 1 - - !$ s = 0 - !$ do i=first_idx,last_idx - !$ s = s + a%irp(i) - !$ end do - !$ if (work > 0) then - !$ sum(ithread+2) = s - !$ end if - - !$OMP BARRIER - - !$OMP SINGLE - !$ do i=2,nthreads+1 - !$ sum(i) = sum(i) + sum(i-1) - !$ end do - !$OMP END SINGLE - - !$ if (work > 0) then - !$ saved_elem = a%irp(first_idx) - !$ end if - !$ if (ithread == 0) then - !$ a%irp(1) = 1 - !$ end if - - !$ if (work > 0) then - !$ old_val = a%irp(first_idx+1) - !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) - !$ end if - - !$ do i=first_idx+2,last_idx+1 - !$ nxt_val = a%irp(i) - !$ a%irp(i) = a%irp(i-1) + old_val - !$ old_val = nxt_val - !$ end do - - !$OMP END PARALLEL - else +!!$ if (use_openmp) then +!!$ !$OMP PARALLEL default(none) & +!!$ !$OMP shared(sum,nthreads,nr,a,itemp,nza) & +!!$ !$OMP private(ithread,work,first_idx,last_idx,s,saved_elem,nxt_val,old_val) +!!$ +!!$ !$OMP DO schedule(STATIC) & +!!$ !$OMP private(k,i) +!!$ do k=1,nza +!!$ i = itemp(k) +!!$ a%irp(i) = a%irp(i) + 1 +!!$ end do +!!$ !$OMP END DO +!!$ +!!$ !$OMP SINGLE +!!$ !$ nthreads = omp_get_num_threads() +!!$ !$ allocate(sum(nthreads+1)) +!!$ !$ sum(:) = 0 +!!$ !$ sum(1) = 1 +!!$ !$OMP END SINGLE +!!$ +!!$ !$ ithread = omp_get_thread_num() +!!$ +!!$ !$ work = nr/nthreads +!!$ !$ if (ithread < MOD(nr,nthreads)) then +!!$ !$ work = work + 1 +!!$ !$ first_idx = ithread*work + 1 +!!$ !$ else +!!$ !$ first_idx = ithread*work + MOD(nr,nthreads) + 1 +!!$ !$ end if +!!$ +!!$ !$ last_idx = first_idx + work - 1 +!!$ +!!$ !$ s = 0 +!!$ !$ do i=first_idx,last_idx +!!$ !$ s = s + a%irp(i) +!!$ !$ end do +!!$ !$ if (work > 0) then +!!$ !$ sum(ithread+2) = s +!!$ !$ end if +!!$ +!!$ !$OMP BARRIER +!!$ +!!$ !$OMP SINGLE +!!$ !$ do i=2,nthreads+1 +!!$ !$ sum(i) = sum(i) + sum(i-1) +!!$ !$ end do +!!$ !$OMP END SINGLE +!!$ +!!$ !$ if (work > 0) then +!!$ !$ saved_elem = a%irp(first_idx) +!!$ !$ end if +!!$ !$ if (ithread == 0) then +!!$ !$ a%irp(1) = 1 +!!$ !$ end if +!!$ +!!$ !$ if (work > 0) then +!!$ !$ old_val = a%irp(first_idx+1) +!!$ !$ a%irp(first_idx+1) = saved_elem + sum(ithread+1) +!!$ !$ end if +!!$ +!!$ !$ do i=first_idx+2,last_idx+1 +!!$ !$ nxt_val = a%irp(i) +!!$ !$ a%irp(i) = a%irp(i-1) + old_val +!!$ !$ old_val = nxt_val +!!$ !$ end do +!!$ +!!$ !$OMP END PARALLEL +!!$ else do k=1,nza i = itemp(k) a%irp(i) = a%irp(i) + 1 @@ -3214,7 +3214,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) ip = ip + ncl end do a%irp(nr+1) = ip - end if +!!$ end if call a%set_host()