From 68f20c0e7a04467f00fcc2b6febbd891a2d61557 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 7 Oct 2024 12:44:45 +0200 Subject: [PATCH 1/2] Modify init --- openacc/psb_oacc_env_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openacc/psb_oacc_env_mod.F90 b/openacc/psb_oacc_env_mod.F90 index dc01ad3a..6d810f74 100644 --- a/openacc/psb_oacc_env_mod.F90 +++ b/openacc/psb_oacc_env_mod.F90 @@ -18,7 +18,7 @@ contains subroutine psb_oacc_init(ctxt, dev) type(psb_ctxt_type), intent(in) :: ctxt integer, intent(in), optional :: dev - + oacc_do_maybe_free_buffer = .false. end subroutine psb_oacc_init subroutine psb_oacc_exit() From 740609a4d8527de07a27d647f7c8b973142439aa Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 7 Oct 2024 12:45:18 +0200 Subject: [PATCH 2/2] Fix present() clauses --- openacc/psb_c_oacc_vect_mod.F90 | 80 ++++++++++++++++++++------------- openacc/psb_d_oacc_vect_mod.F90 | 80 ++++++++++++++++++++------------- openacc/psb_i_oacc_vect_mod.F90 | 58 +++++++++++++++--------- openacc/psb_l_oacc_vect_mod.F90 | 58 +++++++++++++++--------- openacc/psb_s_oacc_vect_mod.F90 | 80 ++++++++++++++++++++------------- openacc/psb_z_oacc_vect_mod.F90 | 80 ++++++++++++++++++++------------- 6 files changed, 266 insertions(+), 170 deletions(-) diff --git a/openacc/psb_c_oacc_vect_mod.F90 b/openacc/psb_c_oacc_vect_mod.F90 index e479f58d..40437184 100644 --- a/openacc/psb_c_oacc_vect_mod.F90 +++ b/openacc/psb_c_oacc_vect_mod.F90 @@ -113,7 +113,7 @@ contains complex(psb_spk_), intent(inout) :: x(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, n x(i) = abs(x(i)) end do @@ -144,7 +144,7 @@ contains complex(psb_spk_), intent(inout) :: x(:),y(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, n y(i) = abs(x(i)) end do @@ -164,7 +164,7 @@ contains complex(psb_spk_), intent(in) :: alpha complex(psb_spk_), intent(inout) :: x(:) integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, size(x) x(i) = alpha * x(i) end do @@ -189,7 +189,7 @@ contains real(psb_spk_) :: sum, mx integer(psb_ipk_) :: i mx = szero - !$acc parallel loop reduction(max:mx) + !$acc parallel loop reduction(max:mx) present(x) do i = 1, n if (abs(x(i)) > mx) mx = abs(x(i)) end do @@ -197,7 +197,7 @@ contains res = mx else sum = szero - !$acc parallel loop reduction(+:sum) + !$acc parallel loop reduction(+:sum) present(x) do i = 1, n sum = sum + abs(x(i)/mx)**2 end do @@ -223,7 +223,7 @@ contains real(psb_spk_) :: max_val integer(psb_ipk_) :: i max_val = szero - !$acc parallel loop reduction(max:max_val) + !$acc parallel loop reduction(max:max_val) present(x) do i = 1, n if (abs(x(i)) > max_val) max_val = abs(x(i)) end do @@ -248,7 +248,7 @@ contains real(psb_spk_) :: res integer(psb_ipk_) :: i res = szero - !$acc parallel loop reduction(+:res) + !$acc parallel loop reduction(+:res) present(x) do i = 1, n res = res + abs(x(i)) end do @@ -265,7 +265,7 @@ contains info = 0 if (y%is_dev()) call y%sync() - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, size(x) y%v(i) = y%v(i) * x(i) end do @@ -283,7 +283,7 @@ contains info = 0 if (z%is_dev()) call z%sync() - !$acc parallel loop + !$acc parallel loop present(x,y,z%v) do i = 1, size(x) z%v(i) = alpha * x(i) * y(i) + beta * z%v(i) end do @@ -327,7 +327,7 @@ contains complex(psb_spk_), intent(inout) :: y(:) complex(psb_spk_), intent(in) :: alpha, beta integer(psb_ipk_), intent(out) :: info - !$acc parallel + !$acc parallel present(x,y) !$acc loop do i = 1, m y(i) = alpha * x(i) + beta * y(i) @@ -384,7 +384,7 @@ contains if ((nx < m) .or. (ny < m) .or. (nz < m)) then info = psb_err_internal_error_ else - !$acc parallel loop + !$acc parallel loop present(xx%v,yy%v,zz%v) do i = 1, m yy%v(i) = alpha * xx%v(i) + beta * yy%v(i) zz%v(i) = gamma * yy%v(i) + delta * zz%v(i) @@ -416,6 +416,7 @@ contains integer(psb_ipk_) :: info, k logical :: acc_done if (.not.allocated(y%combuf)) then + write(0,*) 'allocation error for y%combuf ' call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') return end if @@ -443,8 +444,8 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_spk_) :: beta,x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -488,8 +489,8 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_spk_) :: beta, x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -531,6 +532,7 @@ contains acc_done = .false. if (.not.allocated(x%combuf)) then + write(0,*) 'oacc allocation error combuf gthzbuf ' call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') return end if @@ -556,13 +558,13 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_spk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine c_oacc_gthzbuf @@ -600,13 +602,13 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_spk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine c_oacc_gthzv_x @@ -633,7 +635,7 @@ contains if (vval%is_host()) call vval%sync() if (virl%is_host()) call virl%sync() if (x%is_host()) call x%sync() - !$acc parallel loop + !$acc parallel loop present(x%v,virl%v,vval%v) do i = 1, n x%v(virl%v(i)) = vval%v(i) end do @@ -757,7 +759,7 @@ contains if (present(first)) first_ = max(1, first) if (present(last)) last_ = min(last, last_) - !$acc parallel loop + !$acc parallel loop present(x%v) do i = first_, last_ x%v(i) = val end do @@ -849,26 +851,36 @@ contains class(psb_c_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(out) :: info - if (n /= psb_size(x%combuf)) then + + !write(0,*) 'oacc new_buffer',n,psb_size(x%combuf) + if (n > psb_size(x%combuf)) then + !write(0,*) 'oacc new_buffer: reallocating ' + if (allocated(x%combuf)) then + !if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) + !$acc exit data delete(x%combuf) + end if call x%psb_c_base_vect_type%new_buffer(n,info) !$acc enter data copyin(x%combuf) + ! call acc_copyin(x%combuf) end if end subroutine c_oacc_new_buffer subroutine c_oacc_sync_dev_space(x) implicit none class(psb_c_vect_oacc), intent(inout) :: x - if (allocated(x%v)) call acc_create(x%v) +!!$ write(0,*) 'oacc sync_dev_space' + if (psb_size(x%v)>0) call acc_copyin(x%v) end subroutine c_oacc_sync_dev_space subroutine c_oacc_sync(x) implicit none class(psb_c_vect_oacc), intent(inout) :: x if (x%is_dev()) then - call acc_update_self(x%v) + if (psb_size(x%v)>0) call acc_update_self(x%v) end if if (x%is_host()) then - call acc_update_device(x%v) + if (.not.acc_is_present(x%v)) call c_oacc_sync_dev_space(x) + if (psb_size(x%v)>0) call acc_update_device(x%v) end if call x%set_sync() end subroutine c_oacc_sync @@ -941,6 +953,8 @@ contains type(psb_c_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: info info = 0 +!!$ write(0,*) 'oacc final_vect_free' + call x%free_buffer(info) if (allocated(x%v)) then if (acc_is_present(x%v)) call acc_delete_finalize(x%v) deallocate(x%v, stat=info) @@ -953,8 +967,9 @@ contains class(psb_c_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info info = 0 +!!$ write(0,*) 'oacc vect_free' + call x%free_buffer(info) if (acc_is_present(x%v)) call acc_delete_finalize(x%v) - if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_c_base_vect_type%free(info) end subroutine c_oacc_vect_free @@ -964,8 +979,10 @@ contains integer(psb_ipk_), intent(out) :: info info = 0 - if (psb_oacc_get_maybe_free_buffer())& - & call x%free_buffer(info) + if (psb_oacc_get_maybe_free_buffer()) then + !write(0,*) 'psb_oacc_get_maybe_free_buffer() ',psb_oacc_get_maybe_free_buffer() + call x%free_buffer(info) + end if end subroutine c_oacc_vect_maybe_free_buffer @@ -973,7 +990,7 @@ contains implicit none class(psb_c_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info - +! write(0,*) 'oacc free_buffer' info = 0 if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_c_base_vect_type%free_buffer(info) @@ -985,7 +1002,6 @@ contains class(psb_c_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: res - if (x%is_dev()) call x%sync() res = size(x%v) end function c_oacc_get_size diff --git a/openacc/psb_d_oacc_vect_mod.F90 b/openacc/psb_d_oacc_vect_mod.F90 index 7fd2a441..84441c8a 100644 --- a/openacc/psb_d_oacc_vect_mod.F90 +++ b/openacc/psb_d_oacc_vect_mod.F90 @@ -113,7 +113,7 @@ contains real(psb_dpk_), intent(inout) :: x(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, n x(i) = abs(x(i)) end do @@ -144,7 +144,7 @@ contains real(psb_dpk_), intent(inout) :: x(:),y(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, n y(i) = abs(x(i)) end do @@ -164,7 +164,7 @@ contains real(psb_dpk_), intent(in) :: alpha real(psb_dpk_), intent(inout) :: x(:) integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, size(x) x(i) = alpha * x(i) end do @@ -189,7 +189,7 @@ contains real(psb_dpk_) :: sum, mx integer(psb_ipk_) :: i mx = dzero - !$acc parallel loop reduction(max:mx) + !$acc parallel loop reduction(max:mx) present(x) do i = 1, n if (abs(x(i)) > mx) mx = abs(x(i)) end do @@ -197,7 +197,7 @@ contains res = mx else sum = dzero - !$acc parallel loop reduction(+:sum) + !$acc parallel loop reduction(+:sum) present(x) do i = 1, n sum = sum + abs(x(i)/mx)**2 end do @@ -223,7 +223,7 @@ contains real(psb_dpk_) :: max_val integer(psb_ipk_) :: i max_val = dzero - !$acc parallel loop reduction(max:max_val) + !$acc parallel loop reduction(max:max_val) present(x) do i = 1, n if (abs(x(i)) > max_val) max_val = abs(x(i)) end do @@ -248,7 +248,7 @@ contains real(psb_dpk_) :: res integer(psb_ipk_) :: i res = dzero - !$acc parallel loop reduction(+:res) + !$acc parallel loop reduction(+:res) present(x) do i = 1, n res = res + abs(x(i)) end do @@ -265,7 +265,7 @@ contains info = 0 if (y%is_dev()) call y%sync() - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, size(x) y%v(i) = y%v(i) * x(i) end do @@ -283,7 +283,7 @@ contains info = 0 if (z%is_dev()) call z%sync() - !$acc parallel loop + !$acc parallel loop present(x,y,z%v) do i = 1, size(x) z%v(i) = alpha * x(i) * y(i) + beta * z%v(i) end do @@ -327,7 +327,7 @@ contains real(psb_dpk_), intent(inout) :: y(:) real(psb_dpk_), intent(in) :: alpha, beta integer(psb_ipk_), intent(out) :: info - !$acc parallel + !$acc parallel present(x,y) !$acc loop do i = 1, m y(i) = alpha * x(i) + beta * y(i) @@ -384,7 +384,7 @@ contains if ((nx < m) .or. (ny < m) .or. (nz < m)) then info = psb_err_internal_error_ else - !$acc parallel loop + !$acc parallel loop present(xx%v,yy%v,zz%v) do i = 1, m yy%v(i) = alpha * xx%v(i) + beta * yy%v(i) zz%v(i) = gamma * yy%v(i) + delta * zz%v(i) @@ -416,6 +416,7 @@ contains integer(psb_ipk_) :: info, k logical :: acc_done if (.not.allocated(y%combuf)) then + write(0,*) 'allocation error for y%combuf ' call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') return end if @@ -443,8 +444,8 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_dpk_) :: beta,x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -488,8 +489,8 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_dpk_) :: beta, x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -531,6 +532,7 @@ contains acc_done = .false. if (.not.allocated(x%combuf)) then + write(0,*) 'oacc allocation error combuf gthzbuf ' call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') return end if @@ -556,13 +558,13 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_dpk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine d_oacc_gthzbuf @@ -600,13 +602,13 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_dpk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine d_oacc_gthzv_x @@ -633,7 +635,7 @@ contains if (vval%is_host()) call vval%sync() if (virl%is_host()) call virl%sync() if (x%is_host()) call x%sync() - !$acc parallel loop + !$acc parallel loop present(x%v,virl%v,vval%v) do i = 1, n x%v(virl%v(i)) = vval%v(i) end do @@ -757,7 +759,7 @@ contains if (present(first)) first_ = max(1, first) if (present(last)) last_ = min(last, last_) - !$acc parallel loop + !$acc parallel loop present(x%v) do i = first_, last_ x%v(i) = val end do @@ -849,26 +851,36 @@ contains class(psb_d_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(out) :: info - if (n /= psb_size(x%combuf)) then + + !write(0,*) 'oacc new_buffer',n,psb_size(x%combuf) + if (n > psb_size(x%combuf)) then + !write(0,*) 'oacc new_buffer: reallocating ' + if (allocated(x%combuf)) then + !if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) + !$acc exit data delete(x%combuf) + end if call x%psb_d_base_vect_type%new_buffer(n,info) !$acc enter data copyin(x%combuf) + ! call acc_copyin(x%combuf) end if end subroutine d_oacc_new_buffer subroutine d_oacc_sync_dev_space(x) implicit none class(psb_d_vect_oacc), intent(inout) :: x - if (allocated(x%v)) call acc_create(x%v) +!!$ write(0,*) 'oacc sync_dev_space' + if (psb_size(x%v)>0) call acc_copyin(x%v) end subroutine d_oacc_sync_dev_space subroutine d_oacc_sync(x) implicit none class(psb_d_vect_oacc), intent(inout) :: x if (x%is_dev()) then - call acc_update_self(x%v) + if (psb_size(x%v)>0) call acc_update_self(x%v) end if if (x%is_host()) then - call acc_update_device(x%v) + if (.not.acc_is_present(x%v)) call d_oacc_sync_dev_space(x) + if (psb_size(x%v)>0) call acc_update_device(x%v) end if call x%set_sync() end subroutine d_oacc_sync @@ -941,6 +953,8 @@ contains type(psb_d_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: info info = 0 +!!$ write(0,*) 'oacc final_vect_free' + call x%free_buffer(info) if (allocated(x%v)) then if (acc_is_present(x%v)) call acc_delete_finalize(x%v) deallocate(x%v, stat=info) @@ -953,8 +967,9 @@ contains class(psb_d_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info info = 0 +!!$ write(0,*) 'oacc vect_free' + call x%free_buffer(info) if (acc_is_present(x%v)) call acc_delete_finalize(x%v) - if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_d_base_vect_type%free(info) end subroutine d_oacc_vect_free @@ -964,8 +979,10 @@ contains integer(psb_ipk_), intent(out) :: info info = 0 - if (psb_oacc_get_maybe_free_buffer())& - & call x%free_buffer(info) + if (psb_oacc_get_maybe_free_buffer()) then + !write(0,*) 'psb_oacc_get_maybe_free_buffer() ',psb_oacc_get_maybe_free_buffer() + call x%free_buffer(info) + end if end subroutine d_oacc_vect_maybe_free_buffer @@ -973,7 +990,7 @@ contains implicit none class(psb_d_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info - +! write(0,*) 'oacc free_buffer' info = 0 if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_d_base_vect_type%free_buffer(info) @@ -985,7 +1002,6 @@ contains class(psb_d_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: res - if (x%is_dev()) call x%sync() res = size(x%v) end function d_oacc_get_size diff --git a/openacc/psb_i_oacc_vect_mod.F90 b/openacc/psb_i_oacc_vect_mod.F90 index 455453a1..42cdc18c 100644 --- a/openacc/psb_i_oacc_vect_mod.F90 +++ b/openacc/psb_i_oacc_vect_mod.F90 @@ -70,6 +70,7 @@ contains integer(psb_ipk_) :: info, k logical :: acc_done if (.not.allocated(y%combuf)) then + write(0,*) 'allocation error for y%combuf ' call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') return end if @@ -97,8 +98,8 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_ipk_) :: beta,x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -142,8 +143,8 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_ipk_) :: beta, x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -185,6 +186,7 @@ contains acc_done = .false. if (.not.allocated(x%combuf)) then + write(0,*) 'oacc allocation error combuf gthzbuf ' call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') return end if @@ -210,13 +212,13 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_ipk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine i_oacc_gthzbuf @@ -254,13 +256,13 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_ipk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine i_oacc_gthzv_x @@ -287,7 +289,7 @@ contains if (vval%is_host()) call vval%sync() if (virl%is_host()) call virl%sync() if (x%is_host()) call x%sync() - !$acc parallel loop + !$acc parallel loop present(x%v,virl%v,vval%v) do i = 1, n x%v(virl%v(i)) = vval%v(i) end do @@ -411,7 +413,7 @@ contains if (present(first)) first_ = max(1, first) if (present(last)) last_ = min(last, last_) - !$acc parallel loop + !$acc parallel loop present(x%v) do i = first_, last_ x%v(i) = val end do @@ -449,26 +451,36 @@ contains class(psb_i_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(out) :: info - if (n /= psb_size(x%combuf)) then + + !write(0,*) 'oacc new_buffer',n,psb_size(x%combuf) + if (n > psb_size(x%combuf)) then + !write(0,*) 'oacc new_buffer: reallocating ' + if (allocated(x%combuf)) then + !if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) + !$acc exit data delete(x%combuf) + end if call x%psb_i_base_vect_type%new_buffer(n,info) !$acc enter data copyin(x%combuf) + ! call acc_copyin(x%combuf) end if end subroutine i_oacc_new_buffer subroutine i_oacc_sync_dev_space(x) implicit none class(psb_i_vect_oacc), intent(inout) :: x - if (allocated(x%v)) call acc_create(x%v) +!!$ write(0,*) 'oacc sync_dev_space' + if (psb_size(x%v)>0) call acc_copyin(x%v) end subroutine i_oacc_sync_dev_space subroutine i_oacc_sync(x) implicit none class(psb_i_vect_oacc), intent(inout) :: x if (x%is_dev()) then - call acc_update_self(x%v) + if (psb_size(x%v)>0) call acc_update_self(x%v) end if if (x%is_host()) then - call acc_update_device(x%v) + if (.not.acc_is_present(x%v)) call i_oacc_sync_dev_space(x) + if (psb_size(x%v)>0) call acc_update_device(x%v) end if call x%set_sync() end subroutine i_oacc_sync @@ -541,6 +553,8 @@ contains type(psb_i_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: info info = 0 +!!$ write(0,*) 'oacc final_vect_free' + call x%free_buffer(info) if (allocated(x%v)) then if (acc_is_present(x%v)) call acc_delete_finalize(x%v) deallocate(x%v, stat=info) @@ -553,8 +567,9 @@ contains class(psb_i_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info info = 0 +!!$ write(0,*) 'oacc vect_free' + call x%free_buffer(info) if (acc_is_present(x%v)) call acc_delete_finalize(x%v) - if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_i_base_vect_type%free(info) end subroutine i_oacc_vect_free @@ -564,8 +579,10 @@ contains integer(psb_ipk_), intent(out) :: info info = 0 - if (psb_oacc_get_maybe_free_buffer())& - & call x%free_buffer(info) + if (psb_oacc_get_maybe_free_buffer()) then + !write(0,*) 'psb_oacc_get_maybe_free_buffer() ',psb_oacc_get_maybe_free_buffer() + call x%free_buffer(info) + end if end subroutine i_oacc_vect_maybe_free_buffer @@ -573,7 +590,7 @@ contains implicit none class(psb_i_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info - +! write(0,*) 'oacc free_buffer' info = 0 if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_i_base_vect_type%free_buffer(info) @@ -585,7 +602,6 @@ contains class(psb_i_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: res - if (x%is_dev()) call x%sync() res = size(x%v) end function i_oacc_get_size diff --git a/openacc/psb_l_oacc_vect_mod.F90 b/openacc/psb_l_oacc_vect_mod.F90 index d35e9141..60cdee35 100644 --- a/openacc/psb_l_oacc_vect_mod.F90 +++ b/openacc/psb_l_oacc_vect_mod.F90 @@ -72,6 +72,7 @@ contains integer(psb_ipk_) :: info, k logical :: acc_done if (.not.allocated(y%combuf)) then + write(0,*) 'allocation error for y%combuf ' call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') return end if @@ -99,8 +100,8 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_lpk_) :: beta,x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -144,8 +145,8 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_lpk_) :: beta, x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -187,6 +188,7 @@ contains acc_done = .false. if (.not.allocated(x%combuf)) then + write(0,*) 'oacc allocation error combuf gthzbuf ' call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') return end if @@ -212,13 +214,13 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_lpk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine l_oacc_gthzbuf @@ -256,13 +258,13 @@ contains integer(psb_ipk_) :: n, idx(:) integer(psb_lpk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine l_oacc_gthzv_x @@ -289,7 +291,7 @@ contains if (vval%is_host()) call vval%sync() if (virl%is_host()) call virl%sync() if (x%is_host()) call x%sync() - !$acc parallel loop + !$acc parallel loop present(x%v,virl%v,vval%v) do i = 1, n x%v(virl%v(i)) = vval%v(i) end do @@ -413,7 +415,7 @@ contains if (present(first)) first_ = max(1, first) if (present(last)) last_ = min(last, last_) - !$acc parallel loop + !$acc parallel loop present(x%v) do i = first_, last_ x%v(i) = val end do @@ -451,26 +453,36 @@ contains class(psb_l_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(out) :: info - if (n /= psb_size(x%combuf)) then + + !write(0,*) 'oacc new_buffer',n,psb_size(x%combuf) + if (n > psb_size(x%combuf)) then + !write(0,*) 'oacc new_buffer: reallocating ' + if (allocated(x%combuf)) then + !if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) + !$acc exit data delete(x%combuf) + end if call x%psb_l_base_vect_type%new_buffer(n,info) !$acc enter data copyin(x%combuf) + ! call acc_copyin(x%combuf) end if end subroutine l_oacc_new_buffer subroutine l_oacc_sync_dev_space(x) implicit none class(psb_l_vect_oacc), intent(inout) :: x - if (allocated(x%v)) call acc_create(x%v) +!!$ write(0,*) 'oacc sync_dev_space' + if (psb_size(x%v)>0) call acc_copyin(x%v) end subroutine l_oacc_sync_dev_space subroutine l_oacc_sync(x) implicit none class(psb_l_vect_oacc), intent(inout) :: x if (x%is_dev()) then - call acc_update_self(x%v) + if (psb_size(x%v)>0) call acc_update_self(x%v) end if if (x%is_host()) then - call acc_update_device(x%v) + if (.not.acc_is_present(x%v)) call l_oacc_sync_dev_space(x) + if (psb_size(x%v)>0) call acc_update_device(x%v) end if call x%set_sync() end subroutine l_oacc_sync @@ -543,6 +555,8 @@ contains type(psb_l_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: info info = 0 +!!$ write(0,*) 'oacc final_vect_free' + call x%free_buffer(info) if (allocated(x%v)) then if (acc_is_present(x%v)) call acc_delete_finalize(x%v) deallocate(x%v, stat=info) @@ -555,8 +569,9 @@ contains class(psb_l_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info info = 0 +!!$ write(0,*) 'oacc vect_free' + call x%free_buffer(info) if (acc_is_present(x%v)) call acc_delete_finalize(x%v) - if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_l_base_vect_type%free(info) end subroutine l_oacc_vect_free @@ -566,8 +581,10 @@ contains integer(psb_ipk_), intent(out) :: info info = 0 - if (psb_oacc_get_maybe_free_buffer())& - & call x%free_buffer(info) + if (psb_oacc_get_maybe_free_buffer()) then + !write(0,*) 'psb_oacc_get_maybe_free_buffer() ',psb_oacc_get_maybe_free_buffer() + call x%free_buffer(info) + end if end subroutine l_oacc_vect_maybe_free_buffer @@ -575,7 +592,7 @@ contains implicit none class(psb_l_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info - +! write(0,*) 'oacc free_buffer' info = 0 if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_l_base_vect_type%free_buffer(info) @@ -587,7 +604,6 @@ contains class(psb_l_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: res - if (x%is_dev()) call x%sync() res = size(x%v) end function l_oacc_get_size diff --git a/openacc/psb_s_oacc_vect_mod.F90 b/openacc/psb_s_oacc_vect_mod.F90 index 87eeccea..70c9dd49 100644 --- a/openacc/psb_s_oacc_vect_mod.F90 +++ b/openacc/psb_s_oacc_vect_mod.F90 @@ -113,7 +113,7 @@ contains real(psb_spk_), intent(inout) :: x(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, n x(i) = abs(x(i)) end do @@ -144,7 +144,7 @@ contains real(psb_spk_), intent(inout) :: x(:),y(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, n y(i) = abs(x(i)) end do @@ -164,7 +164,7 @@ contains real(psb_spk_), intent(in) :: alpha real(psb_spk_), intent(inout) :: x(:) integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, size(x) x(i) = alpha * x(i) end do @@ -189,7 +189,7 @@ contains real(psb_spk_) :: sum, mx integer(psb_ipk_) :: i mx = szero - !$acc parallel loop reduction(max:mx) + !$acc parallel loop reduction(max:mx) present(x) do i = 1, n if (abs(x(i)) > mx) mx = abs(x(i)) end do @@ -197,7 +197,7 @@ contains res = mx else sum = szero - !$acc parallel loop reduction(+:sum) + !$acc parallel loop reduction(+:sum) present(x) do i = 1, n sum = sum + abs(x(i)/mx)**2 end do @@ -223,7 +223,7 @@ contains real(psb_spk_) :: max_val integer(psb_ipk_) :: i max_val = szero - !$acc parallel loop reduction(max:max_val) + !$acc parallel loop reduction(max:max_val) present(x) do i = 1, n if (abs(x(i)) > max_val) max_val = abs(x(i)) end do @@ -248,7 +248,7 @@ contains real(psb_spk_) :: res integer(psb_ipk_) :: i res = szero - !$acc parallel loop reduction(+:res) + !$acc parallel loop reduction(+:res) present(x) do i = 1, n res = res + abs(x(i)) end do @@ -265,7 +265,7 @@ contains info = 0 if (y%is_dev()) call y%sync() - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, size(x) y%v(i) = y%v(i) * x(i) end do @@ -283,7 +283,7 @@ contains info = 0 if (z%is_dev()) call z%sync() - !$acc parallel loop + !$acc parallel loop present(x,y,z%v) do i = 1, size(x) z%v(i) = alpha * x(i) * y(i) + beta * z%v(i) end do @@ -327,7 +327,7 @@ contains real(psb_spk_), intent(inout) :: y(:) real(psb_spk_), intent(in) :: alpha, beta integer(psb_ipk_), intent(out) :: info - !$acc parallel + !$acc parallel present(x,y) !$acc loop do i = 1, m y(i) = alpha * x(i) + beta * y(i) @@ -384,7 +384,7 @@ contains if ((nx < m) .or. (ny < m) .or. (nz < m)) then info = psb_err_internal_error_ else - !$acc parallel loop + !$acc parallel loop present(xx%v,yy%v,zz%v) do i = 1, m yy%v(i) = alpha * xx%v(i) + beta * yy%v(i) zz%v(i) = gamma * yy%v(i) + delta * zz%v(i) @@ -416,6 +416,7 @@ contains integer(psb_ipk_) :: info, k logical :: acc_done if (.not.allocated(y%combuf)) then + write(0,*) 'allocation error for y%combuf ' call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') return end if @@ -443,8 +444,8 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_spk_) :: beta,x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -488,8 +489,8 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_spk_) :: beta, x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -531,6 +532,7 @@ contains acc_done = .false. if (.not.allocated(x%combuf)) then + write(0,*) 'oacc allocation error combuf gthzbuf ' call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') return end if @@ -556,13 +558,13 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_spk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine s_oacc_gthzbuf @@ -600,13 +602,13 @@ contains integer(psb_ipk_) :: n, idx(:) real(psb_spk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine s_oacc_gthzv_x @@ -633,7 +635,7 @@ contains if (vval%is_host()) call vval%sync() if (virl%is_host()) call virl%sync() if (x%is_host()) call x%sync() - !$acc parallel loop + !$acc parallel loop present(x%v,virl%v,vval%v) do i = 1, n x%v(virl%v(i)) = vval%v(i) end do @@ -757,7 +759,7 @@ contains if (present(first)) first_ = max(1, first) if (present(last)) last_ = min(last, last_) - !$acc parallel loop + !$acc parallel loop present(x%v) do i = first_, last_ x%v(i) = val end do @@ -849,26 +851,36 @@ contains class(psb_s_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(out) :: info - if (n /= psb_size(x%combuf)) then + + !write(0,*) 'oacc new_buffer',n,psb_size(x%combuf) + if (n > psb_size(x%combuf)) then + !write(0,*) 'oacc new_buffer: reallocating ' + if (allocated(x%combuf)) then + !if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) + !$acc exit data delete(x%combuf) + end if call x%psb_s_base_vect_type%new_buffer(n,info) !$acc enter data copyin(x%combuf) + ! call acc_copyin(x%combuf) end if end subroutine s_oacc_new_buffer subroutine s_oacc_sync_dev_space(x) implicit none class(psb_s_vect_oacc), intent(inout) :: x - if (allocated(x%v)) call acc_create(x%v) +!!$ write(0,*) 'oacc sync_dev_space' + if (psb_size(x%v)>0) call acc_copyin(x%v) end subroutine s_oacc_sync_dev_space subroutine s_oacc_sync(x) implicit none class(psb_s_vect_oacc), intent(inout) :: x if (x%is_dev()) then - call acc_update_self(x%v) + if (psb_size(x%v)>0) call acc_update_self(x%v) end if if (x%is_host()) then - call acc_update_device(x%v) + if (.not.acc_is_present(x%v)) call s_oacc_sync_dev_space(x) + if (psb_size(x%v)>0) call acc_update_device(x%v) end if call x%set_sync() end subroutine s_oacc_sync @@ -941,6 +953,8 @@ contains type(psb_s_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: info info = 0 +!!$ write(0,*) 'oacc final_vect_free' + call x%free_buffer(info) if (allocated(x%v)) then if (acc_is_present(x%v)) call acc_delete_finalize(x%v) deallocate(x%v, stat=info) @@ -953,8 +967,9 @@ contains class(psb_s_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info info = 0 +!!$ write(0,*) 'oacc vect_free' + call x%free_buffer(info) if (acc_is_present(x%v)) call acc_delete_finalize(x%v) - if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_s_base_vect_type%free(info) end subroutine s_oacc_vect_free @@ -964,8 +979,10 @@ contains integer(psb_ipk_), intent(out) :: info info = 0 - if (psb_oacc_get_maybe_free_buffer())& - & call x%free_buffer(info) + if (psb_oacc_get_maybe_free_buffer()) then + !write(0,*) 'psb_oacc_get_maybe_free_buffer() ',psb_oacc_get_maybe_free_buffer() + call x%free_buffer(info) + end if end subroutine s_oacc_vect_maybe_free_buffer @@ -973,7 +990,7 @@ contains implicit none class(psb_s_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info - +! write(0,*) 'oacc free_buffer' info = 0 if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_s_base_vect_type%free_buffer(info) @@ -985,7 +1002,6 @@ contains class(psb_s_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: res - if (x%is_dev()) call x%sync() res = size(x%v) end function s_oacc_get_size diff --git a/openacc/psb_z_oacc_vect_mod.F90 b/openacc/psb_z_oacc_vect_mod.F90 index 0fe1adaa..0bc10283 100644 --- a/openacc/psb_z_oacc_vect_mod.F90 +++ b/openacc/psb_z_oacc_vect_mod.F90 @@ -113,7 +113,7 @@ contains complex(psb_dpk_), intent(inout) :: x(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, n x(i) = abs(x(i)) end do @@ -144,7 +144,7 @@ contains complex(psb_dpk_), intent(inout) :: x(:),y(:) integer(psb_ipk_) :: n integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, n y(i) = abs(x(i)) end do @@ -164,7 +164,7 @@ contains complex(psb_dpk_), intent(in) :: alpha complex(psb_dpk_), intent(inout) :: x(:) integer(psb_ipk_) :: i - !$acc parallel loop + !$acc parallel loop present(x) do i = 1, size(x) x(i) = alpha * x(i) end do @@ -189,7 +189,7 @@ contains real(psb_dpk_) :: sum, mx integer(psb_ipk_) :: i mx = dzero - !$acc parallel loop reduction(max:mx) + !$acc parallel loop reduction(max:mx) present(x) do i = 1, n if (abs(x(i)) > mx) mx = abs(x(i)) end do @@ -197,7 +197,7 @@ contains res = mx else sum = dzero - !$acc parallel loop reduction(+:sum) + !$acc parallel loop reduction(+:sum) present(x) do i = 1, n sum = sum + abs(x(i)/mx)**2 end do @@ -223,7 +223,7 @@ contains real(psb_dpk_) :: max_val integer(psb_ipk_) :: i max_val = dzero - !$acc parallel loop reduction(max:max_val) + !$acc parallel loop reduction(max:max_val) present(x) do i = 1, n if (abs(x(i)) > max_val) max_val = abs(x(i)) end do @@ -248,7 +248,7 @@ contains real(psb_dpk_) :: res integer(psb_ipk_) :: i res = dzero - !$acc parallel loop reduction(+:res) + !$acc parallel loop reduction(+:res) present(x) do i = 1, n res = res + abs(x(i)) end do @@ -265,7 +265,7 @@ contains info = 0 if (y%is_dev()) call y%sync() - !$acc parallel loop + !$acc parallel loop present(x,y) do i = 1, size(x) y%v(i) = y%v(i) * x(i) end do @@ -283,7 +283,7 @@ contains info = 0 if (z%is_dev()) call z%sync() - !$acc parallel loop + !$acc parallel loop present(x,y,z%v) do i = 1, size(x) z%v(i) = alpha * x(i) * y(i) + beta * z%v(i) end do @@ -327,7 +327,7 @@ contains complex(psb_dpk_), intent(inout) :: y(:) complex(psb_dpk_), intent(in) :: alpha, beta integer(psb_ipk_), intent(out) :: info - !$acc parallel + !$acc parallel present(x,y) !$acc loop do i = 1, m y(i) = alpha * x(i) + beta * y(i) @@ -384,7 +384,7 @@ contains if ((nx < m) .or. (ny < m) .or. (nz < m)) then info = psb_err_internal_error_ else - !$acc parallel loop + !$acc parallel loop present(xx%v,yy%v,zz%v) do i = 1, m yy%v(i) = alpha * xx%v(i) + beta * yy%v(i) zz%v(i) = gamma * yy%v(i) + delta * zz%v(i) @@ -416,6 +416,7 @@ contains integer(psb_ipk_) :: info, k logical :: acc_done if (.not.allocated(y%combuf)) then + write(0,*) 'allocation error for y%combuf ' call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') return end if @@ -443,8 +444,8 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_dpk_) :: beta,x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -488,8 +489,8 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_dpk_) :: beta, x(:), y(:) integer(psb_ipk_) :: k - !$acc update device(x(1:n)) async - !$acc parallel loop + !$acc update device(x(1:n)) + !$acc parallel loop present(x,y) do k = 1, n y(idx(k)) = x(k) + beta *y(idx(k)) end do @@ -531,6 +532,7 @@ contains acc_done = .false. if (.not.allocated(x%combuf)) then + write(0,*) 'oacc allocation error combuf gthzbuf ' call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') return end if @@ -556,13 +558,13 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_dpk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine z_oacc_gthzbuf @@ -600,13 +602,13 @@ contains integer(psb_ipk_) :: n, idx(:) complex(psb_dpk_) :: x(:), y(:) integer(psb_ipk_) :: k - - !$acc parallel loop present(y) + ! + !$acc parallel loop present(x,y) do k = 1, n y(k) = x(idx(k)) end do !$acc end parallel loop - !$acc update self(y(1:n)) async + !$acc update self(y(1:n)) end subroutine inner_gth end subroutine z_oacc_gthzv_x @@ -633,7 +635,7 @@ contains if (vval%is_host()) call vval%sync() if (virl%is_host()) call virl%sync() if (x%is_host()) call x%sync() - !$acc parallel loop + !$acc parallel loop present(x%v,virl%v,vval%v) do i = 1, n x%v(virl%v(i)) = vval%v(i) end do @@ -757,7 +759,7 @@ contains if (present(first)) first_ = max(1, first) if (present(last)) last_ = min(last, last_) - !$acc parallel loop + !$acc parallel loop present(x%v) do i = first_, last_ x%v(i) = val end do @@ -849,26 +851,36 @@ contains class(psb_z_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(out) :: info - if (n /= psb_size(x%combuf)) then + + !write(0,*) 'oacc new_buffer',n,psb_size(x%combuf) + if (n > psb_size(x%combuf)) then + !write(0,*) 'oacc new_buffer: reallocating ' + if (allocated(x%combuf)) then + !if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) + !$acc exit data delete(x%combuf) + end if call x%psb_z_base_vect_type%new_buffer(n,info) !$acc enter data copyin(x%combuf) + ! call acc_copyin(x%combuf) end if end subroutine z_oacc_new_buffer subroutine z_oacc_sync_dev_space(x) implicit none class(psb_z_vect_oacc), intent(inout) :: x - if (allocated(x%v)) call acc_create(x%v) +!!$ write(0,*) 'oacc sync_dev_space' + if (psb_size(x%v)>0) call acc_copyin(x%v) end subroutine z_oacc_sync_dev_space subroutine z_oacc_sync(x) implicit none class(psb_z_vect_oacc), intent(inout) :: x if (x%is_dev()) then - call acc_update_self(x%v) + if (psb_size(x%v)>0) call acc_update_self(x%v) end if if (x%is_host()) then - call acc_update_device(x%v) + if (.not.acc_is_present(x%v)) call z_oacc_sync_dev_space(x) + if (psb_size(x%v)>0) call acc_update_device(x%v) end if call x%set_sync() end subroutine z_oacc_sync @@ -941,6 +953,8 @@ contains type(psb_z_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: info info = 0 +!!$ write(0,*) 'oacc final_vect_free' + call x%free_buffer(info) if (allocated(x%v)) then if (acc_is_present(x%v)) call acc_delete_finalize(x%v) deallocate(x%v, stat=info) @@ -953,8 +967,9 @@ contains class(psb_z_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info info = 0 +!!$ write(0,*) 'oacc vect_free' + call x%free_buffer(info) if (acc_is_present(x%v)) call acc_delete_finalize(x%v) - if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_z_base_vect_type%free(info) end subroutine z_oacc_vect_free @@ -964,8 +979,10 @@ contains integer(psb_ipk_), intent(out) :: info info = 0 - if (psb_oacc_get_maybe_free_buffer())& - & call x%free_buffer(info) + if (psb_oacc_get_maybe_free_buffer()) then + !write(0,*) 'psb_oacc_get_maybe_free_buffer() ',psb_oacc_get_maybe_free_buffer() + call x%free_buffer(info) + end if end subroutine z_oacc_vect_maybe_free_buffer @@ -973,7 +990,7 @@ contains implicit none class(psb_z_vect_oacc), intent(inout) :: x integer(psb_ipk_), intent(out) :: info - +! write(0,*) 'oacc free_buffer' info = 0 if (acc_is_present(x%combuf)) call acc_delete_finalize(x%combuf) call x%psb_z_base_vect_type%free_buffer(info) @@ -985,7 +1002,6 @@ contains class(psb_z_vect_oacc), intent(inout) :: x integer(psb_ipk_) :: res - if (x%is_dev()) call x%sync() res = size(x%v) end function z_oacc_get_size