diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 06dec27d..7c7452a0 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_c_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index f071a5b8..56273b49 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_d_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index cd2933f7..17e73934 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_s_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 88f49335..f88008d8 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -2931,7 +2931,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO @@ -3099,7 +3099,7 @@ subroutine psb_z_mv_csr_from_coo(a,b,info) do k=1,nza i = itemp(k) !$OMP ATOMIC UPDATE - a%irp(i+1) = a%irp(i+1) + 1 + a%irp(i) = a%irp(i) + 1 !$OMP END ATOMIC end do !$OMP END DO diff --git a/base/serial/psi_c_serial_impl.F90 b/base/serial/psi_c_serial_impl.F90 index d7c92d3a..a3898349 100644 --- a/base/serial/psi_c_serial_impl.F90 +++ b/base/serial/psi_c_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_c_exscanv(n,x,info,shift) complex(psb_spk_), intent(in), optional :: shift complex(psb_spk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - complex(psb_spk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + complex(psb_spk_), pointer :: sumb(:) + complex(psb_spk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = czero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_c_exscan diff --git a/base/serial/psi_d_serial_impl.F90 b/base/serial/psi_d_serial_impl.F90 index 56bedc84..1b5b1442 100644 --- a/base/serial/psi_d_serial_impl.F90 +++ b/base/serial/psi_d_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_d_exscanv(n,x,info,shift) real(psb_dpk_), intent(in), optional :: shift real(psb_dpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - real(psb_dpk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + real(psb_dpk_), pointer :: sumb(:) + real(psb_dpk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = dzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_d_exscan diff --git a/base/serial/psi_e_serial_impl.F90 b/base/serial/psi_e_serial_impl.F90 index ba9a0b2a..9cdcdf0e 100644 --- a/base/serial/psi_e_serial_impl.F90 +++ b/base/serial/psi_e_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_e_exscanv(n,x,info,shift) integer(psb_epk_), intent(in), optional :: shift integer(psb_epk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - integer(psb_epk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_epk_), pointer :: sumb(:) + integer(psb_epk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = ezero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_e_exscan diff --git a/base/serial/psi_i2_serial_impl.F90 b/base/serial/psi_i2_serial_impl.F90 index 200e378a..d25617a9 100644 --- a/base/serial/psi_i2_serial_impl.F90 +++ b/base/serial/psi_i2_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_i2_exscanv(n,x,info,shift) integer(psb_i2pk_), intent(in), optional :: shift integer(psb_i2pk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - integer(psb_i2pk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_i2pk_), pointer :: sumb(:) + integer(psb_i2pk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = i2zero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_i2_exscan diff --git a/base/serial/psi_m_serial_impl.F90 b/base/serial/psi_m_serial_impl.F90 index 05461e1e..05c8e60f 100644 --- a/base/serial/psi_m_serial_impl.F90 +++ b/base/serial/psi_m_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_m_exscanv(n,x,info,shift) integer(psb_mpk_), intent(in), optional :: shift integer(psb_mpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - integer(psb_mpk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + integer(psb_mpk_), pointer :: sumb(:) + integer(psb_mpk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = mzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_m_exscan diff --git a/base/serial/psi_s_serial_impl.F90 b/base/serial/psi_s_serial_impl.F90 index 6e5eadd7..26a57e68 100644 --- a/base/serial/psi_s_serial_impl.F90 +++ b/base/serial/psi_s_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_s_exscanv(n,x,info,shift) real(psb_spk_), intent(in), optional :: shift real(psb_spk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - real(psb_spk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + real(psb_spk_), pointer :: sumb(:) + real(psb_spk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = szero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_s_exscan diff --git a/base/serial/psi_z_serial_impl.F90 b/base/serial/psi_z_serial_impl.F90 index ef13db0b..0b15b2d6 100644 --- a/base/serial/psi_z_serial_impl.F90 +++ b/base/serial/psi_z_serial_impl.F90 @@ -43,6 +43,7 @@ subroutine psi_z_exscanv(n,x,info,shift) complex(psb_dpk_), intent(in), optional :: shift complex(psb_dpk_) :: shift_, tp, ts + integer(psb_ipk_) :: i logical is_nested, is_parallel if (present(shift)) then @@ -76,9 +77,11 @@ contains ! a pointer. The semantics of COPYPRIVATE is that the POINTER is copied ! so effectively we are recovering a SHARED SUMB which is what ! we need in this case. If it was an ALLOCATABLE, then it would be the contents - ! that would get copied, and the SHARED effect would no longer be there. - integer(psb_ipk_) :: i,ithread,nthreads,first_idx,last_idx,wrk - complex(psb_dpk_), pointer :: sumb(:) + ! that would get copied, and the SHARED effect would no longer be there. + ! Simple parallel version of EXSCAN + integer(psb_ipk_) :: i,ithread,nthreads,idxstart,idxend,wrk + complex(psb_dpk_), pointer :: sumb(:) + complex(psb_dpk_) :: tp, ts nthreads = omp_get_num_threads() ithread = omp_get_thread_num() @@ -90,19 +93,21 @@ contains wrk = (n)/nthreads if (ithread < MOD((n),nthreads)) then wrk = wrk + 1 - first_idx = ithread*wrk + 1 + idxstart = ithread*wrk + 1 else - first_idx = ithread*wrk + MOD((n),nthreads) + 1 + idxstart = ithread*wrk + MOD((n),nthreads) + 1 end if - last_idx = min(first_idx + wrk - 1,n ) - if (first_idx<=last_idx) then - sumb(ithread+2) = sumb(ithread+2) + x(first_idx) - do i=first_idx+1,last_idx - sumb(ithread+2) = sumb(ithread+2) + x(i) - x(i) = x(i)+x(i-1) + idxend = min(idxstart + wrk - 1,n ) + tp = zzero + if (idxstart<=idxend) then + do i=idxstart,idxend + ts = x(i) + x(i) = tp + tp = tp + ts end do end if + sumb(ithread+2) = tp !$OMP BARRIER !$OMP SINGLE @@ -115,11 +120,10 @@ contains !$OMP DO SCHEDULE(STATIC) do i=1,n - x(i) = sumb(ithread+1) + x(i) + shift_ + x(i) = x(i) + sumb(ithread+1) + shift_ end do !$OMP END DO !$OMP SINGLE - x(1) = shift_ deallocate(sumb) !$OMP END SINGLE end subroutine inner_z_exscan