oacc : d_vect + i_vect + d_csr

oacc_loloum
Théophane Loloum 8 months ago
parent 39cfcd3893
commit 2b8671fba6

@ -0,0 +1,48 @@
.SUFFIXES:
.SUFFIXES: .F90 .f90 .o .s .c
# Compilers and flags
CC=mpicc
FC=mpif90
FCOPT=-O0 -march=native
OFFLOAD=-fopenacc -foffload=nvptx-none="-march=sm_70"
# Directories
LIBDIR=../lib
INCDIR=../include
MODDIR=../modules
IMPLDIR=./impl # Adding the impl directory
# Include and library paths
INCLUDES=-I$(LIBDIR) -I$(INCDIR) -I$(MODDIR) -I$(IMPLDIR)
LIBS=-L$(LIBDIR) -lpsb_util -lpsb_ext -lpsb_base -lopenblas -lmetis
# Source files
FOBJS= psb_i_oacc_vect_mod.o psb_d_oacc_vect_mod.o \
psb_oacc_mod.o psb_d_oacc_csr_mat_mod.o \
impl/psb_d_oacc_csr_vect_mv.o
# Library name
LIBNAME=libpsb_openacc.a
# Rules
all: $(LIBNAME)
$(LIBNAME): $(FOBJS)
ar cr $(LIBNAME) $(FOBJS)
/bin/cp -p $(LIBNAME) $(LIBDIR)
clean:
/bin/rm -fr *.o $(LIBNAME) *.mod impl/*.o
.f90.o:
$(FC) $(FCOPT) $(OFFLOAD) $(INCLUDES) -c $< -o $@
.c.o:
$(CC) -c $< -o $@
.F90.o:
$(FC) $(FCOPT) $(OFFLOAD) $(INCLUDES) -c $< -o $@
.F90.s:
$(FC) $(FCOPT) $(INCLUDES) -c -S $< -o $@

@ -0,0 +1,61 @@
subroutine psb_d_oacc_csr_vect_mv(alpha, a, x, beta, y, info, trans)
use psb_base_mod
use psb_d_oacc_csr_mat_mod, psb_protect_name => psb_d_oacc_csr_vect_mv
implicit none
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_oacc_csr_sparse_mat), intent(in) :: a
class(psb_d_base_vect_type), intent(inout) :: x, y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
integer(psb_ipk_) :: m, n
info = psb_success_
m = a%get_nrows()
n = a%get_ncols()
if ((n /= size(x%v)) .or. (n /= size(y%v))) then
write(0,*) 'Size error ', m, n, size(x%v), size(y%v)
info = psb_err_invalid_mat_state_
return
end if
if (a%is_host()) call a%sync()
if (x%is_host()) call x%sync()
if (y%is_host()) call y%sync()
call inner_spmv(m, n, alpha, a%val, a%ja, a%irp, x%v, beta, y%v, info)
call y%set_dev()
contains
subroutine inner_spmv(m, n, alpha, val, ja, irp, x, beta, y, info)
implicit none
integer(psb_ipk_) :: m, n
real(psb_dpk_), intent(in) :: alpha, beta
real(psb_dpk_) :: val(:), x(:), y(:)
integer(psb_ipk_) :: ja(:), irp(:)
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, j, ii, isz
real(psb_dpk_) :: tmp
integer(psb_ipk_), parameter :: vsz = 256
info = 0
!$acc parallel loop vector_length(vsz) private(isz)
do ii = 1, m, vsz
isz = min(vsz, m - ii + 1)
!$acc loop independent private(tmp)
do i = ii, ii + isz - 1
tmp = 0.0_psb_dpk_
!$acc loop seq
do j = irp(i), irp(i + 1) - 1
tmp = tmp + val(j) * x(ja(j))
end do
y(i) = alpha * tmp + beta * y(i)
end do
end do
end subroutine inner_spmv
end subroutine psb_d_oacc_csr_vect_mv

@ -0,0 +1,185 @@
module psb_d_oacc_csr_mat_mod
use iso_c_binding
use psb_d_mat_mod
use psb_d_oacc_vect_mod
use oaccsparse_mod
integer(psb_ipk_), parameter, private :: is_host = -1
integer(psb_ipk_), parameter, private :: is_sync = 0
integer(psb_ipk_), parameter, private :: is_dev = 1
type, extends(psb_d_csr_sparse_mat) :: psb_d_oacc_csr_sparse_mat
integer(psb_ipk_) :: devstate = is_host
contains
procedure, pass(a) :: all => d_oacc_csr_all
procedure, pass(a) :: is_host => d_oacc_csr_is_host
procedure, pass(a) :: is_sync => d_oacc_csr_is_sync
procedure, pass(a) :: is_dev => d_oacc_csr_is_dev
procedure, pass(a) :: set_host => d_oacc_csr_set_host
procedure, pass(a) :: set_sync => d_oacc_csr_set_sync
procedure, pass(a) :: set_dev => d_oacc_csr_set_dev
procedure, pass(a) :: sync_space => d_oacc_csr_sync_space
procedure, pass(a) :: sync => d_oacc_csr_sync
procedure, pass(a) :: vect_mv => psb_d_oacc_csr_vect_mv
end type psb_d_oacc_csr_sparse_mat
interface
subroutine psb_d_oacc_csr_vect_mv(alpha, a, x, beta, y, info, trans)
import :: psb_d_oacc_csr_sparse_mat, psb_dpk_, psb_d_base_vect_type, psb_ipk_
class(psb_d_oacc_csr_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_vect_type), intent(inout) :: x, y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_oacc_csr_vect_mv
end interface
contains
subroutine d_oacc_csr_all(m, n, nz, a, info)
implicit none
integer(psb_ipk_), intent(in) :: m, n, nz
class(psb_d_oacc_csr_sparse_mat), intent(out) :: a
integer(psb_ipk_), intent(out) :: info
info = 0
if (allocated(a%val)) then
!$acc exit data delete(a%val) finalize
deallocate(a%val, stat=info)
end if
if (allocated(a%ja)) then
!$acc exit data delete(a%ja) finalize
deallocate(a%ja, stat=info)
end if
if (allocated(a%irp)) then
!$acc exit data delete(a%irp) finalize
deallocate(a%irp, stat=info)
end if
call a%set_nrows(m)
call a%set_ncols(n)
allocate(a%val(nz),stat=info)
allocate(a%ja(nz),stat=info)
allocate(a%irp(m+1),stat=info)
if (info == 0) call a%set_host()
if (info == 0) call a%sync_space()
end subroutine d_oacc_csr_all
function d_oacc_csr_is_host(a) result(res)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(in) :: a
logical :: res
res = (a%devstate == is_host)
end function d_oacc_csr_is_host
function d_oacc_csr_is_sync(a) result(res)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(in) :: a
logical :: res
res = (a%devstate == is_sync)
end function d_oacc_csr_is_sync
function d_oacc_csr_is_dev(a) result(res)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(in) :: a
logical :: res
res = (a%devstate == is_dev)
end function d_oacc_csr_is_dev
subroutine d_oacc_csr_set_host(a)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a
a%devstate = is_host
end subroutine d_oacc_csr_set_host
subroutine d_oacc_csr_set_sync(a)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a
a%devstate = is_sync
end subroutine d_oacc_csr_set_sync
subroutine d_oacc_csr_set_dev(a)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a
a%devstate = is_dev
end subroutine d_oacc_csr_set_dev
subroutine d_oacc_csr_sync_space(a)
implicit none
class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a
if (allocated(a%val)) then
call d_oacc_create_dev(a%val)
end if
if (allocated(a%ja)) then
call i_oacc_create_dev(a%ja)
end if
if (allocated(a%irp)) then
call i_oacc_create_dev(a%irp)
end if
contains
subroutine d_oacc_create_dev(v)
implicit none
real(psb_dpk_), intent(in) :: v(:)
!$acc enter data copyin(v)
end subroutine d_oacc_create_dev
subroutine i_oacc_create_dev(v)
implicit none
integer(psb_ipk_), intent(in) :: v(:)
!$acc enter data copyin(v)
end subroutine i_oacc_create_dev
end subroutine d_oacc_csr_sync_space
subroutine d_oacc_csr_sync(a)
implicit none
class(psb_d_oacc_csr_sparse_mat), target, intent(in) :: a
class(psb_d_oacc_csr_sparse_mat), pointer :: tmpa
integer(psb_ipk_) :: info
tmpa => a
if (a%is_dev()) then
call d_oacc_csr_to_host(a%val)
call i_oacc_csr_to_host(a%ja)
call i_oacc_csr_to_host(a%irp)
else if (a%is_host()) then
call d_oacc_csr_to_dev(a%val)
call i_oacc_csr_to_dev(a%ja)
call i_oacc_csr_to_dev(a%irp)
end if
call tmpa%set_sync()
end subroutine d_oacc_csr_sync
subroutine d_oacc_csr_to_dev(v)
implicit none
real(psb_dpk_), intent(in) :: v(:)
!$acc update device(v)
end subroutine d_oacc_csr_to_dev
subroutine d_oacc_csr_to_host(v)
implicit none
real(psb_dpk_), intent(in) :: v(:)
!$acc update self(v)
end subroutine d_oacc_csr_to_host
subroutine i_oacc_csr_to_dev(v)
implicit none
integer(psb_ipk_), intent(in) :: v(:)
!$acc update device(v)
end subroutine i_oacc_csr_to_dev
subroutine i_oacc_csr_to_host(v)
implicit none
integer(psb_ipk_), intent(in) :: v(:)
!$acc update self(v)
end subroutine i_oacc_csr_to_host
end module psb_d_oacc_csr_mat_mod

@ -0,0 +1,966 @@
module psb_d_oacc_vect_mod
use iso_c_binding
use psb_const_mod
use psb_error_mod
use psb_d_vect_mod
use psb_i_oacc_vect_mod
use psb_i_vect_mod
integer(psb_ipk_), parameter, private :: is_host = -1
integer(psb_ipk_), parameter, private :: is_sync = 0
integer(psb_ipk_), parameter, private :: is_dev = 1
type, extends(psb_d_base_vect_type) :: psb_d_vect_oacc
integer :: state = is_host
contains
procedure, pass(x) :: get_nrows => d_oacc_get_nrows
procedure, nopass :: get_fmt => d_oacc_get_fmt
procedure, pass(x) :: all => d_oacc_vect_all
procedure, pass(x) :: zero => d_oacc_zero
procedure, pass(x) :: asb_m => d_oacc_asb_m
procedure, pass(x) :: sync => d_oacc_sync
procedure, pass(x) :: sync_space => d_oacc_sync_space
procedure, pass(x) :: bld_x => d_oacc_bld_x
procedure, pass(x) :: bld_mn => d_oacc_bld_mn
procedure, pass(x) :: free => d_oacc_vect_free
procedure, pass(x) :: ins_a => d_oacc_ins_a
procedure, pass(x) :: ins_v => d_oacc_ins_v
procedure, pass(x) :: is_host => d_oacc_is_host
procedure, pass(x) :: is_dev => d_oacc_is_dev
procedure, pass(x) :: is_sync => d_oacc_is_sync
procedure, pass(x) :: set_host => d_oacc_set_host
procedure, pass(x) :: set_dev => d_oacc_set_dev
procedure, pass(x) :: set_sync => d_oacc_set_sync
procedure, pass(x) :: set_scal => d_oacc_set_scal
procedure, pass(x) :: gthzv_x => d_oacc_gthzv_x
procedure, pass(x) :: gthzbuf_x => d_oacc_gthzbuf
procedure, pass(y) :: sctb => d_oacc_sctb
procedure, pass(y) :: sctb_x => d_oacc_sctb_x
procedure, pass(y) :: sctb_buf => d_oacc_sctb_buf
procedure, pass(x) :: get_size => oacc_get_size
procedure, pass(x) :: dot_v => d_oacc_vect_dot
procedure, pass(x) :: dot_a => d_oacc_dot_a
procedure, pass(y) :: axpby_v => d_oacc_axpby_v
procedure, pass(y) :: axpby_a => d_oacc_axpby_a
procedure, pass(z) :: abgdxyz => d_oacc_abgdxyz
procedure, pass(y) :: mlt_v => d_oacc_mlt_v
procedure, pass(y) :: mlt_a => d_oacc_mlt_a
procedure, pass(z) :: mlt_a_2 => d_oacc_mlt_a_2
procedure, pass(z) :: mlt_v_2 => d_oacc_mlt_v_2
procedure, pass(x) :: scal => d_oacc_scal
procedure, pass(x) :: nrm2 => d_oacc_nrm2
procedure, pass(x) :: amax => d_oacc_amax
procedure, pass(x) :: asum => d_oacc_asum
procedure, pass(x) :: absval1 => d_oacc_absval1
procedure, pass(x) :: absval2 => d_oacc_absval2
end type psb_d_vect_oacc
real(psb_dpk_), allocatable :: v1(:),v2(:),p(:)
contains
subroutine d_oacc_absval1(x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: n, i
if (x%is_host()) call x%sync_space()
n = size(x%v)
!$acc parallel loop
do i = 1, n
x%v(i) = abs(x%v(i))
end do
call x%set_dev()
end subroutine d_oacc_absval1
subroutine d_oacc_absval2(x, y)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
class(psb_d_base_vect_type), intent(inout) :: y
integer(psb_ipk_) :: n
integer(psb_ipk_) :: i
n = min(size(x%v), size(y%v))
select type (yy => y)
class is (psb_d_vect_oacc)
if (x%is_host()) call x%sync()
if (yy%is_host()) call yy%sync()
!$acc parallel loop
do i = 1, n
yy%v(i) = abs(x%v(i))
end do
class default
if (x%is_dev()) call x%sync()
if (y%is_dev()) call y%sync()
call x%psb_d_base_vect_type%absval(y)
end select
end subroutine d_oacc_absval2
subroutine d_oacc_scal(alpha, x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
real(psb_dpk_), intent(in) :: alpha
integer(psb_ipk_) :: info
integer(psb_ipk_) :: i
if (x%is_host()) call x%sync_space()
!$acc parallel loop
do i = 1, size(x%v)
x%v(i) = alpha * x%v(i)
end do
call x%set_dev()
end subroutine d_oacc_scal
function d_oacc_nrm2(n, x) result(res)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
real(psb_dpk_) :: res
integer(psb_ipk_) :: info
real(psb_dpk_) :: sum
integer(psb_ipk_) :: i
if (x%is_host()) call x%sync_space()
sum = 0.0
!$acc parallel loop reduction(+:sum)
do i = 1, n
sum = sum + x%v(i) * x%v(i)
end do
res = sqrt(sum)
end function d_oacc_nrm2
function d_oacc_amax(n, x) result(res)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
real(psb_dpk_) :: res
integer(psb_ipk_) :: info
real(psb_dpk_) :: max_val
integer(psb_ipk_) :: i
if (x%is_host()) call x%sync_space()
max_val = -huge(0.0)
!$acc parallel loop reduction(max:max_val)
do i = 1, n
if (x%v(i) > max_val) max_val = x%v(i)
end do
res = max_val
end function d_oacc_amax
function d_oacc_asum(n, x) result(res)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
real(psb_dpk_) :: res
integer(psb_ipk_) :: info
real(psb_dpk_) :: sum
integer(psb_ipk_) :: i
if (x%is_host()) call x%sync_space()
sum = 0.0
!$acc parallel loop reduction(+:sum)
do i = 1, n
sum = sum + abs(x%v(i))
end do
res = sum
end function d_oacc_asum
subroutine d_oacc_mlt_v(x, y, info)
use psi_serial_mod
implicit none
class(psb_d_base_vect_type), intent(inout) :: x
class(psb_d_vect_oacc), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, n
info = 0
n = min(x%get_nrows(), y%get_nrows())
select type(xx => x)
type is (psb_d_base_vect_type)
if (y%is_dev()) call y%sync()
!$acc parallel loop
do i = 1, n
y%v(i) = y%v(i) * xx%v(i)
end do
call y%set_host()
class default
if (xx%is_dev()) call xx%sync()
if (y%is_dev()) call y%sync()
!$acc parallel loop
do i = 1, n
y%v(i) = y%v(i) * xx%v(i)
end do
call y%set_host()
end select
end subroutine d_oacc_mlt_v
subroutine d_oacc_mlt_a(x, y, info)
use psi_serial_mod
implicit none
real(psb_dpk_), intent(in) :: x(:)
class(psb_d_vect_oacc), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, n
info = 0
if (y%is_dev()) call y%sync_space()
!$acc parallel loop
do i = 1, size(x)
y%v(i) = y%v(i) * x(i)
end do
call y%set_host()
end subroutine d_oacc_mlt_a
subroutine d_oacc_mlt_a_2(alpha, x, y, beta, z, info)
use psi_serial_mod
implicit none
real(psb_dpk_), intent(in) :: alpha, beta
real(psb_dpk_), intent(in) :: x(:)
real(psb_dpk_), intent(in) :: y(:)
class(psb_d_vect_oacc), intent(inout) :: z
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, n
info = 0
if (z%is_dev()) call z%sync_space()
!$acc parallel loop
do i = 1, size(x)
z%v(i) = alpha * x(i) * y(i) + beta * z%v(i)
end do
call z%set_host()
end subroutine d_oacc_mlt_a_2
subroutine d_oacc_mlt_v_2(alpha, x, y, beta, z, info, conjgx, conjgy)
use psi_serial_mod
use psb_string_mod
implicit none
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_vect_type), intent(inout) :: x
class(psb_d_base_vect_type), intent(inout) :: y
class(psb_d_vect_oacc), intent(inout) :: z
integer(psb_ipk_), intent(out) :: info
character(len=1), intent(in), optional :: conjgx, conjgy
integer(psb_ipk_) :: i, n
logical :: conjgx_, conjgy_
conjgx_ = .false.
conjgy_ = .false.
if (present(conjgx)) conjgx_ = (psb_toupper(conjgx) == 'C')
if (present(conjgy)) conjgy_ = (psb_toupper(conjgy) == 'C')
n = min(x%get_nrows(), y%get_nrows(), z%get_nrows())
info = 0
select type(xx => x)
class is (psb_d_vect_oacc)
select type (yy => y)
class is (psb_d_vect_oacc)
if (xx%is_host()) call xx%sync_space()
if (yy%is_host()) call yy%sync_space()
if ((beta /= dzero) .and. (z%is_host())) call z%sync_space()
!$acc parallel loop
do i = 1, n
z%v(i) = alpha * xx%v(i) * yy%v(i) + beta * z%v(i)
end do
call z%set_dev()
class default
if (xx%is_dev()) call xx%sync_space()
if (yy%is_dev()) call yy%sync()
if ((beta /= dzero) .and. (z%is_dev())) call z%sync_space()
!$acc parallel loop
do i = 1, n
z%v(i) = alpha * xx%v(i) * yy%v(i) + beta * z%v(i)
end do
call z%set_host()
end select
class default
if (x%is_dev()) call x%sync()
if (y%is_dev()) call y%sync()
if ((beta /= dzero) .and. (z%is_dev())) call z%sync_space()
!$acc parallel loop
do i = 1, n
z%v(i) = alpha * x%v(i) * y%v(i) + beta * z%v(i)
end do
call z%set_host()
end select
end subroutine d_oacc_mlt_v_2
subroutine d_oacc_axpby_v(m, alpha, x, beta, y, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
class(psb_d_base_vect_type), intent(inout) :: x
class(psb_d_vect_oacc), intent(inout) :: y
real(psb_dpk_), intent(in) :: alpha, beta
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nx, ny, i
info = psb_success_
select type(xx => x)
type is (psb_d_vect_oacc)
if ((beta /= dzero) .and. y%is_host()) call y%sync_space()
if (xx%is_host()) call xx%sync_space()
nx = size(xx%v)
ny = size(y%v)
if ((nx < m) .or. (ny < m)) then
info = psb_err_internal_error_
else
!$acc parallel loop
do i = 1, m
y%v(i) = alpha * xx%v(i) + beta * y%v(i)
end do
end if
call y%set_dev()
class default
if ((alpha /= dzero) .and. (x%is_dev())) call x%sync()
call y%axpby(m, alpha, x%v, beta, info)
end select
end subroutine d_oacc_axpby_v
subroutine d_oacc_axpby_a(m, alpha, x, beta, y, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
real(psb_dpk_), intent(in) :: x(:)
class(psb_d_vect_oacc), intent(inout) :: y
real(psb_dpk_), intent(in) :: alpha, beta
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i
if ((beta /= dzero) .and. (y%is_dev())) call y%sync_space()
!$acc parallel loop
do i = 1, m
y%v(i) = alpha * x(i) + beta * y%v(i)
end do
call y%set_host()
end subroutine d_oacc_axpby_a
subroutine d_oacc_abgdxyz(m, alpha, beta, gamma, delta, x, y, z, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
class(psb_d_base_vect_type), intent(inout) :: x
class(psb_d_base_vect_type), intent(inout) :: y
class(psb_d_vect_oacc), intent(inout) :: z
real(psb_dpk_), intent(in) :: alpha, beta, gamma, delta
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nx, ny, nz, i
logical :: gpu_done
info = psb_success_
gpu_done = .false.
select type(xx => x)
class is (psb_d_vect_oacc)
select type(yy => y)
class is (psb_d_vect_oacc)
select type(zz => z)
class is (psb_d_vect_oacc)
if ((beta /= dzero) .and. yy%is_host()) call yy%sync_space()
if ((delta /= dzero) .and. zz%is_host()) call zz%sync_space()
if (xx%is_host()) call xx%sync_space()
nx = size(xx%v)
ny = size(yy%v)
nz = size(zz%v)
if ((nx < m) .or. (ny < m) .or. (nz < m)) then
info = psb_err_internal_error_
else
!$acc parallel loop
do i = 1, m
yy%v(i) = alpha * xx%v(i) + beta * yy%v(i)
zz%v(i) = gamma * yy%v(i) + delta * zz%v(i)
end do
end if
call yy%set_dev()
call zz%set_dev()
gpu_done = .true.
end select
end select
end select
if (.not. gpu_done) then
if (x%is_host()) call x%sync()
if (y%is_host()) call y%sync()
if (z%is_host()) call z%sync()
call y%axpby(m, alpha, x, beta, info)
call z%axpby(m, gamma, y, delta, info)
end if
end subroutine d_oacc_abgdxyz
subroutine d_oacc_sctb_buf(i, n, idx, beta, y)
use psb_base_mod
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
real(psb_dpk_) :: beta
class(psb_d_vect_oacc) :: y
integer(psb_ipk_) :: info
if (.not.allocated(y%combuf)) then
call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf')
return
end if
select type(ii => idx)
class is (psb_i_vect_oacc)
if (ii%is_host()) call ii%sync_space(info)
if (y%is_host()) call y%sync_space()
!$acc parallel loop
do i = 1, n
y%v(ii%v(i)) = beta * y%v(ii%v(i)) + y%combuf(i)
end do
class default
!$acc parallel loop
do i = 1, n
y%v(idx%v(i)) = beta * y%v(idx%v(i)) + y%combuf(i)
end do
end select
end subroutine d_oacc_sctb_buf
subroutine d_oacc_sctb_x(i, n, idx, x, beta, y)
use psb_base_mod
implicit none
integer(psb_ipk_):: i, n
class(psb_i_base_vect_type) :: idx
real(psb_dpk_) :: beta, x(:)
class(psb_d_vect_oacc) :: y
integer(psb_ipk_) :: info, ni
select type(ii => idx)
class is (psb_i_vect_oacc)
if (ii%is_host()) call ii%sync_space(info)
class default
call psb_errpush(info, 'd_oacc_sctb_x')
return
end select
if (y%is_host()) call y%sync_space()
!$acc parallel loop
do i = 1, n
y%v(idx%v(i)) = beta * y%v(idx%v(i)) + x(i)
end do
call y%set_dev()
end subroutine d_oacc_sctb_x
subroutine d_oacc_sctb(n, idx, x, beta, y)
use psb_base_mod
implicit none
integer(psb_ipk_) :: n
integer(psb_ipk_) :: idx(:)
real(psb_dpk_) :: beta, x(:)
class(psb_d_vect_oacc) :: y
integer(psb_ipk_) :: info
integer(psb_ipk_) :: i
if (n == 0) return
if (y%is_dev()) call y%sync_space()
!$acc parallel loop
do i = 1, n
y%v(idx(i)) = beta * y%v(idx(i)) + x(i)
end do
call y%set_host()
end subroutine d_oacc_sctb
subroutine d_oacc_gthzbuf(i, n, idx, x)
use psb_base_mod
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
class(psb_d_vect_oacc) :: x
integer(psb_ipk_) :: info
info = 0
if (.not.allocated(x%combuf)) then
call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf')
return
end if
select type(ii => idx)
class is (psb_i_vect_oacc)
if (ii%is_host()) call ii%sync_space(info)
class default
call psb_errpush(info, 'd_oacc_gthzbuf')
return
end select
if (x%is_host()) call x%sync_space()
!$acc parallel loop
do i = 1, n
x%combuf(i) = x%v(idx%v(i))
end do
end subroutine d_oacc_gthzbuf
subroutine d_oacc_gthzv_x(i, n, idx, x, y)
use psb_base_mod
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type):: idx
real(psb_dpk_) :: y(:)
class(psb_d_vect_oacc):: x
integer(psb_ipk_) :: info
info = 0
select type(ii => idx)
class is (psb_i_vect_oacc)
if (ii%is_host()) call ii%sync_space(info)
class default
call psb_errpush(info, 'd_oacc_gthzv_x')
return
end select
if (x%is_host()) call x%sync_space()
!$acc parallel loop
do i = 1, n
y(i) = x%v(idx%v(i))
end do
end subroutine d_oacc_gthzv_x
subroutine d_oacc_ins_v(n, irl, val, dupl, x, info)
use psi_serial_mod
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
class(psb_i_base_vect_type), intent(inout) :: irl
class(psb_d_base_vect_type), intent(inout) :: val
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, isz
logical :: done_oacc
info = 0
if (psb_errstatus_fatal()) return
done_oacc = .false.
select type(virl => irl)
type is (psb_i_vect_oacc)
select type(vval => val)
type is (psb_d_vect_oacc)
if (vval%is_host()) call vval%sync_space()
if (virl%is_host()) call virl%sync_space(info)
if (x%is_host()) call x%sync_space()
!$acc parallel loop
do i = 1, n
x%v(virl%v(i)) = vval%v(i)
end do
call x%set_dev()
done_oacc = .true.
end select
end select
if (.not.done_oacc) then
select type(virl => irl)
type is (psb_i_vect_oacc)
if (virl%is_dev()) call virl%sync_space(info)
end select
select type(vval => val)
type is (psb_d_vect_oacc)
if (vval%is_dev()) call vval%sync_space()
end select
call x%ins(n, irl%v, val%v, dupl, info)
end if
if (info /= 0) then
call psb_errpush(info, 'oacc_vect_ins')
return
end if
end subroutine d_oacc_ins_v
subroutine d_oacc_ins_a(n, irl, val, dupl, x, info)
use psi_serial_mod
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
integer(psb_ipk_), intent(in) :: irl(:)
real(psb_dpk_), intent(in) :: val(:)
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i
info = 0
if (x%is_dev()) call x%sync_space()
call x%psb_d_base_vect_type%ins(n, irl, val, dupl, info)
call x%set_host()
!$acc update device(x%v)
end subroutine d_oacc_ins_a
subroutine d_oacc_bld_mn(x, n)
use psb_base_mod
implicit none
integer(psb_mpk_), intent(in) :: n
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: info
call x%all(n, info)
if (info /= 0) then
call psb_errpush(info, 'd_oacc_bld_mn', i_err=(/n, n, n, n, n/))
end if
call x%set_host()
!$acc update device(x%v)
end subroutine d_oacc_bld_mn
subroutine d_oacc_bld_x(x, this)
use psb_base_mod
implicit none
real(psb_dpk_), intent(in) :: this(:)
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: info
call psb_realloc(size(this), x%v, info)
if (info /= 0) then
info = psb_err_alloc_request_
call psb_errpush(info, 'd_oacc_bld_x', &
i_err=(/size(this), izero, izero, izero, izero/))
return
end if
x%v(:) = this(:)
call x%set_host()
!$acc update device(x%v)
end subroutine d_oacc_bld_x
subroutine d_oacc_asb_m(n, x, info)
use psb_base_mod
implicit none
integer(psb_mpk_), intent(in) :: n
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_mpk_) :: nd
info = psb_success_
if (x%is_dev()) then
nd = size(x%v)
if (nd < n) then
call x%sync()
call x%psb_d_base_vect_type%asb(n, info)
if (info == psb_success_) call x%sync_space()
call x%set_host()
end if
else
if (size(x%v) < n) then
call x%psb_d_base_vect_type%asb(n, info)
if (info == psb_success_) call x%sync_space()
call x%set_host()
end if
end if
end subroutine d_oacc_asb_m
subroutine d_oacc_set_scal(x, val, first, last)
class(psb_d_vect_oacc), intent(inout) :: x
real(psb_dpk_), intent(in) :: val
integer(psb_ipk_), optional :: first, last
integer(psb_ipk_) :: first_, last_
first_ = 1
last_ = x%get_nrows()
if (present(first)) first_ = max(1, first)
if (present(last)) last_ = min(last, last_)
!$acc parallel loop
do i = first_, last_
x%v(i) = val
end do
!$acc end parallel loop
call x%set_dev()
end subroutine d_oacc_set_scal
subroutine d_oacc_zero(x)
use psi_serial_mod
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
call x%set_dev()
call x%set_scal(dzero)
end subroutine d_oacc_zero
function d_oacc_get_nrows(x) result(res)
implicit none
class(psb_d_vect_oacc), intent(in) :: x
integer(psb_ipk_) :: res
if (allocated(x%v)) res = size(x%v)
end function d_oacc_get_nrows
function d_oacc_get_fmt() result(res)
implicit none
character(len=5) :: res
res = "dOACC"
end function d_oacc_get_fmt
function d_oacc_vect_dot(n, x, y) result(res)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
class(psb_d_base_vect_type), intent(inout) :: y
integer(psb_ipk_), intent(in) :: n
real(psb_dpk_) :: res
real(psb_dpk_), external :: ddot
integer(psb_ipk_) :: info
integer(psb_ipk_) :: i
res = dzero
select type(yy => y)
type is (psb_d_base_vect_type)
if (x%is_dev()) call x%sync()
res = ddot(n, x%v, 1, yy%v, 1)
type is (psb_d_vect_oacc)
if (x%is_host()) call x%sync()
if (yy%is_host()) call yy%sync()
!$acc parallel loop reduction(+:res) present(x%v, yy%v)
do i = 1, n
res = res + x%v(i) * yy%v(i)
end do
!$acc end parallel loop
class default
call x%sync()
res = y%dot(n, x%v)
end select
end function d_oacc_vect_dot
function d_oacc_dot_a(n, x, y) result(res)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
real(psb_dpk_), intent(in) :: y(:)
integer(psb_ipk_), intent(in) :: n
real(psb_dpk_) :: res
real(psb_dpk_), external :: ddot
if (x%is_dev()) call x%sync()
res = ddot(n, y, 1, x%v, 1)
end function d_oacc_dot_a
! subroutine d_oacc_set_vect(x,y)
! implicit none
! class(psb_d_vect_oacc), intent(inout) :: x
! real(psb_dpk_), intent(in) :: y(:)
! integer(psb_ipk_) :: info
! if (size(x%v) /= size(y)) then
! call x%free(info)
! call x%all(size(y),info)
! end if
! x%v(:) = y(:)
! call x%set_host()
! end subroutine d_oacc_set_vect
subroutine d_oacc_to_dev(v)
implicit none
real(psb_dpk_) :: v(:)
!$acc update device(v)
end subroutine d_oacc_to_dev
subroutine d_oacc_to_host(v)
implicit none
real(psb_dpk_) :: v(:)
!$acc update self(v)
end subroutine d_oacc_to_host
subroutine d_oacc_sync_space(x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
if (allocated(x%v)) then
call d_oacc_create_dev(x%v)
end if
contains
subroutine d_oacc_create_dev(v)
implicit none
real(psb_dpk_) :: v(:)
!$acc enter data copyin(v)
end subroutine d_oacc_create_dev
end subroutine d_oacc_sync_space
subroutine d_oacc_sync(x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
if (x%is_dev()) then
call d_oacc_to_host(x%v)
end if
if (x%is_host()) then
call d_oacc_to_dev(x%v)
end if
call x%set_sync()
end subroutine d_oacc_sync
subroutine d_oacc_set_host(x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
x%state = is_host
end subroutine d_oacc_set_host
subroutine d_oacc_set_dev(x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
x%state = is_dev
end subroutine d_oacc_set_dev
subroutine d_oacc_set_sync(x)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
x%state = is_sync
end subroutine d_oacc_set_sync
function d_oacc_is_dev(x) result(res)
implicit none
class(psb_d_vect_oacc), intent(in) :: x
logical :: res
res = (x%state == is_dev)
end function d_oacc_is_dev
function d_oacc_is_host(x) result(res)
implicit none
class(psb_d_vect_oacc), intent(in) :: x
logical :: res
res = (x%state == is_host)
end function d_oacc_is_host
function d_oacc_is_sync(x) result(res)
implicit none
class(psb_d_vect_oacc), intent(in) :: x
logical :: res
res = (x%state == is_sync)
end function d_oacc_is_sync
subroutine d_oacc_vect_all(n, x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
integer(psb_ipk_), intent(in) :: n
class(psb_d_vect_oacc), intent(out) :: x
integer(psb_ipk_), intent(out) :: info
call psb_realloc(n, x%v, info)
if (info == 0) then
call x%set_host()
!$acc enter data create(x%v)
call x%sync_space()
end if
if (info /= 0) then
info = psb_err_alloc_request_
call psb_errpush(info, 'd_oacc_all', &
i_err=(/n, n, n, n, n/))
end if
end subroutine d_oacc_vect_all
subroutine d_oacc_vect_free(x, info)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
info = 0
if (allocated(x%v)) then
!$acc exit data delete(x%v) finalize
deallocate(x%v, stat=info)
end if
end subroutine d_oacc_vect_free
function oacc_get_size(x) result(res)
implicit none
class(psb_d_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: res
if (x%is_dev()) call x%sync()
res = size(x%v)
end function oacc_get_size
subroutine initialize(N)
integer(psb_ipk_) :: N
integer(psb_ipk_) :: i
allocate(v1(N),v2(N),p(N))
!$acc enter data create(v1,v2,p)
!$acc parallel
!$acc loop
do i=1,n
v1(i) = i
v2(i) = n+i
end do
!$acc end parallel
end subroutine initialize
subroutine finalize_dev()
!$acc exit data delete(v1,v2,p)
end subroutine finalize_dev
subroutine finalize_host()
deallocate(v1,v2,p)
end subroutine finalize_host
subroutine to_dev()
!$acc update device(v1,v2)
end subroutine to_dev
subroutine to_host()
!$acc update self(v1,v2)
end subroutine to_host
function d_dot(N) result(res)
real(kind(1.d0)) :: res
integer(psb_ipk_) :: i,N
real(kind(1.d0)) :: t1,t2,t3
res = 0.0d0
!$acc parallel
!$acc loop reduction(+:res)
do i=1,N
res = res + v1(i) * v2(i)
end do
!$acc end parallel
end function d_dot
function h_dot(N) result(res)
integer(psb_ipk_) :: i,N
real(kind(1.d0)) :: t1,t2,t3,res
res = 0.0d0
do i=1,N
res = res + v1(i) * v2(i)
end do
end function h_dot
end module psb_d_oacc_vect_mod

@ -0,0 +1,455 @@
module psb_i_oacc_vect_mod
use iso_c_binding
use psb_const_mod
use psb_error_mod
use psb_i_vect_mod
integer(psb_ipk_), parameter, private :: is_host = -1
integer(psb_ipk_), parameter, private :: is_sync = 0
integer(psb_ipk_), parameter, private :: is_dev = 1
type, extends(psb_i_base_vect_type) :: psb_i_vect_oacc
integer :: state = is_host
contains
procedure, pass(x) :: get_nrows => i_oacc_get_nrows
procedure, nopass :: get_fmt => i_oacc_get_fmt
procedure, pass(x) :: all => i_oacc_all
procedure, pass(x) :: zero => i_oacc_zero
procedure, pass(x) :: asb_m => i_oacc_asb_m
procedure, pass(x) :: sync => i_oacc_sync
procedure, pass(x) :: sync_space => i_oacc_sync_space
procedure, pass(x) :: bld_x => i_oacc_bld_x
procedure, pass(x) :: bld_mn => i_oacc_bld_mn
procedure, pass(x) :: free => i_oacc_free
procedure, pass(x) :: ins_a => i_oacc_ins_a
procedure, pass(x) :: ins_v => i_oacc_ins_v
procedure, pass(x) :: is_host => i_oacc_is_host
procedure, pass(x) :: is_dev => i_oacc_is_dev
procedure, pass(x) :: is_sync => i_oacc_is_sync
procedure, pass(x) :: set_host => i_oacc_set_host
procedure, pass(x) :: set_dev => i_oacc_set_dev
procedure, pass(x) :: set_sync => i_oacc_set_sync
procedure, pass(x) :: set_scal => i_oacc_set_scal
procedure, pass(x) :: gthzv_x => i_oacc_gthzv_x
procedure, pass(y) :: sctb => i_oacc_sctb
procedure, pass(y) :: sctb_x => i_oacc_sctb_x
procedure, pass(x) :: gthzbuf => i_oacc_gthzbuf
procedure, pass(y) :: sctb_buf => i_oacc_sctb_buf
final :: i_oacc_vect_finalize
end type psb_i_vect_oacc
public :: psb_i_vect_oacc_
private :: constructor
interface psb_i_vect_oacc_
module procedure constructor
end interface psb_i_vect_oacc_
contains
function constructor(x) result(this)
integer(psb_ipk_) :: x(:)
type(psb_i_vect_oacc) :: this
integer(psb_ipk_) :: info
this%v = x
call this%asb(size(x), info)
end function constructor
subroutine i_oacc_gthzv_x(i, n, idx, x, y)
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
integer(psb_ipk_) :: y(:)
class(psb_i_vect_oacc) :: x
integer :: info
!$acc parallel loop
do i = 1, n
y(i) = x%v(idx%v(i))
end do
end subroutine i_oacc_gthzv_x
subroutine i_oacc_gthzbuf(i, n, idx, x)
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
class(psb_i_vect_oacc) :: x
integer :: info
if (.not.allocated(x%combuf)) then
call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf')
return
end if
!$acc parallel loop
do i = 1, n
x%combuf(i) = x%v(idx%v(i))
end do
end subroutine i_oacc_gthzbuf
subroutine i_oacc_sctb(n, idx, x, beta, y)
implicit none
integer(psb_ipk_) :: n, idx(:)
integer(psb_ipk_) :: beta, x(:)
class(psb_i_vect_oacc) :: y
integer(psb_ipk_) :: info
integer :: i
if (n == 0) return
!$acc parallel loop
do i = 1, n
y%v(idx(i)) = beta * y%v(idx(i)) + x(i)
end do
end subroutine i_oacc_sctb
subroutine i_oacc_sctb_x(i, n, idx, x, beta, y)
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
integer(psb_ipk_) :: beta, x(:)
class(psb_i_vect_oacc) :: y
integer :: info
select type(ii => idx)
class is (psb_i_vect_oacc)
if (ii%is_host()) call ii%sync_space(info)
if (y%is_host()) call y%sync_space(info)
!$acc parallel loop
do i = 1, n
y%v(ii%v(i)) = beta * y%v(ii%v(i)) + x(i)
end do
class default
!$acc parallel loop
do i = 1, n
y%v(idx%v(i)) = beta * y%v(idx%v(i)) + x(i)
end do
end select
end subroutine i_oacc_sctb_x
subroutine i_oacc_sctb_buf(i, n, idx, beta, y)
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
integer(psb_ipk_) :: beta
class(psb_i_vect_oacc) :: y
integer(psb_ipk_) :: info
if (.not.allocated(y%v)) then
call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf')
return
end if
!$acc parallel loop
do i = 1, n
y%v(idx%v(i)) = beta * y%v(idx%v(i)) + y%v(i)
end do
end subroutine i_oacc_sctb_buf
subroutine i_oacc_set_host(x)
class(psb_i_vect_oacc), intent(inout) :: x
x%state = is_host
end subroutine i_oacc_set_host
subroutine i_oacc_set_sync(x)
class(psb_i_vect_oacc), intent(inout) :: x
x%state = is_sync
end subroutine i_oacc_set_sync
subroutine i_oacc_set_dev(x)
class(psb_i_vect_oacc), intent(inout) :: x
x%state = is_dev
end subroutine i_oacc_set_dev
subroutine i_oacc_set_scal(x, val, first, last)
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: val
integer(psb_ipk_), optional :: first, last
integer(psb_ipk_) :: first_, last_
first_ = 1
last_ = size(x%v)
if (present(first)) first_ = max(1, first)
if (present(last)) last_ = min(size(x%v), last)
!$acc parallel loop
do i = first_, last_
x%v(i) = val
end do
call x%set_dev()
end subroutine i_oacc_set_scal
function i_oacc_is_host(x) result(res)
implicit none
class(psb_i_vect_oacc), intent(in) :: x
logical :: res
res = (x%state == is_host)
end function i_oacc_is_host
function i_oacc_is_dev(x) result(res)
implicit none
class(psb_i_vect_oacc), intent(in) :: x
logical :: res
res = (x%state == is_dev)
end function i_oacc_is_dev
function i_oacc_is_sync(x) result(res)
implicit none
class(psb_i_vect_oacc), intent(in) :: x
logical :: res
res = (x%state == is_sync)
end function i_oacc_is_sync
subroutine i_oacc_free(x, info)
use psb_error_mod
implicit none
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
info = 0
if (allocated(x%v)) deallocate(x%v, stat=info)
if (info /= 0) then
info = psb_err_alloc_dealloc_
call psb_errpush(info, 'i_oacc_free')
end if
call x%set_sync()
end subroutine i_oacc_free
subroutine i_oacc_ins_a(n, irl, val, dupl, x, info)
implicit none
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
integer(psb_ipk_), intent(in) :: irl(:)
integer(psb_ipk_), intent(in) :: val(:)
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i
info = 0
if (x%is_dev()) call x%sync()
call x%psb_i_base_vect_type%ins(n, irl, val, dupl, info)
call x%set_host()
end subroutine i_oacc_ins_a
subroutine i_oacc_ins_v(n, irl, val, dupl, x, info)
implicit none
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
class(psb_i_base_vect_type), intent(inout) :: irl
class(psb_i_base_vect_type), intent(inout) :: val
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, isz
logical :: done_oacc
info = 0
if (psb_errstatus_fatal()) return
done_oacc = .false.
select type(virl => irl)
class is (psb_i_vect_oacc)
select type(vval => val)
class is (psb_i_vect_oacc)
if (vval%is_host()) call vval%sync()
if (virl%is_host()) call virl%sync()
if (x%is_host()) call x%sync()
! Add the OpenACC kernel call here if needed
call x%set_dev()
done_oacc = .true.
end select
end select
if (.not.done_oacc) then
if (irl%is_dev()) call irl%sync()
if (val%is_dev()) call val%sync()
call x%ins(n, irl%v, val%v, dupl, info)
end if
if (info /= 0) then
call psb_errpush(info,'i_oacc_ins_v')
return
end if
end subroutine i_oacc_ins_v
subroutine i_oacc_bld_x(x, this)
use psb_error_mod
implicit none
integer(psb_ipk_), intent(in) :: this(:)
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: info
call psb_realloc(size(this), x%v, info)
if (info /= 0) then
info = psb_err_alloc_request_
call psb_errpush(info, 'i_oacc_bld_x', i_err = (/size(this), izero, izero, izero, izero/))
end if
x%v(:) = this(:)
call x%set_host()
call x%sync()
end subroutine i_oacc_bld_x
subroutine i_oacc_bld_mn(x, n)
use psb_error_mod
implicit none
integer(psb_mpk_), intent(in) :: n
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: info
call x%all(n, info)
if (info /= 0) then
call psb_errpush(info, 'i_oacc_bld_mn', i_err = (/n, n, n, n, n/))
end if
end subroutine i_oacc_bld_mn
subroutine i_oacc_sync(x)
use psb_error_mod
implicit none
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: n, info
info = 0
if (x%is_host()) then
n = size(x%v)
if (.not.allocated(x%v)) then
write(*, *) 'Incoherent situation : x%v not allocated'
call psb_realloc(n, x%v, info)
end if
if ((n > size(x%v)) .or. (n > x%get_nrows())) then
write(*, *) 'Incoherent situation : sizes', n, size(x%v), x%get_nrows()
call psb_realloc(n, x%v, info)
end if
!$acc update device(x%v)
else if (x%is_dev()) then
n = size(x%v)
if (.not.allocated(x%v)) then
write(*, *) 'Incoherent situation : x%v not allocated'
call psb_realloc(n, x%v, info)
end if
!$acc update self(x%v)
end if
if (info == 0) call x%set_sync()
if (info /= 0) then
info = psb_err_internal_error_
call psb_errpush(info, 'i_oacc_sync')
end if
end subroutine i_oacc_sync
subroutine i_oacc_sync_space(x, info)
use psb_error_mod
implicit none
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nh, nd
info = 0
if (x%is_dev()) then
nh = size(x%v)
nd = nh
if (nh < nd) then
call psb_realloc(nd, x%v, info)
end if
else
nh = size(x%v)
nd = nh
if (nh < nd) then
call psb_realloc(nd, x%v, info)
end if
end if
end subroutine i_oacc_sync_space
function i_oacc_get_nrows(x) result(res)
implicit none
class(psb_i_vect_oacc), intent(in) :: x
integer(psb_ipk_) :: res
res = 0
if (allocated(x%v)) res = size(x%v)
end function i_oacc_get_nrows
function i_oacc_get_fmt() result(res)
implicit none
character(len=5) :: res
res = 'iOACC'
end function i_oacc_get_fmt
subroutine i_oacc_all(n, x, info)
use psb_error_mod
implicit none
class(psb_i_vect_oacc), intent(out) :: x
integer(psb_ipk_), intent(in) :: n
integer(psb_ipk_), intent(out) :: info
call psb_realloc(n, x%v, info)
if (info == 0) call x%set_host()
if (info == 0) call x%sync_space(info)
if (info /= 0) then
info = psb_err_alloc_request_
call psb_errpush(info, 'i_oacc_all', i_err=(/n, n, n, n, n/))
end if
end subroutine i_oacc_all
subroutine i_oacc_zero(x)
use psb_error_mod
implicit none
class(psb_i_vect_oacc), intent(inout) :: x
! Ensure zeroing on the GPU side
call x%set_dev()
x%v = 0
!$acc update device(x%v)
end subroutine i_oacc_zero
subroutine i_oacc_asb_m(n, x, info)
use psb_error_mod
use psb_realloc_mod
implicit none
integer(psb_ipk_), intent(in) :: n
class(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nh, nd
info = 0
if (x%is_dev()) then
nd = size(x%v)
if (nd < n) then
call x%sync()
call x%psb_i_base_vect_type%asb(n, info)
if (info == psb_success_) call x%sync_space(info)
call x%set_host()
end if
else
nh = size(x%v)
if (nh < n) then
call x%psb_i_base_vect_type%asb(n, info)
if (info == psb_success_) call x%sync_space(info)
call x%set_host()
end if
end if
end subroutine i_oacc_asb_m
subroutine i_oacc_vect_finalize(x)
use psi_serial_mod
use psb_realloc_mod
implicit none
type(psb_i_vect_oacc), intent(inout) :: x
integer(psb_ipk_) :: info
info = 0
call x%free(info)
end subroutine i_oacc_vect_finalize
end module psb_i_oacc_vect_mod

@ -0,0 +1,7 @@
module psb_oacc_mod
use psb_const_mod
use psb_d_oacc_vect_mod
use psb_d_oacc_csr_mat_mod
end module psb_oacc_mod

@ -0,0 +1,53 @@
TOPDIR=../..
include $(TOPDIR)/Make.inc
# Directories
LIBDIR=$(TOPDIR)/lib/
PSBLIBDIR=$(TOPDIR)/lib/
PSBINCDIR=$(TOPDIR)/include
PSBMODDIR=$(TOPDIR)/modules
INCDIR=$(TOPDIR)/include
MODDIR=$(TOPDIR)/modules
EXEDIR=./runs
# Libraries
PSBLAS_LIB= -L$(LIBDIR) -L$(PSBLIBDIR) -lpsb_util -lpsb_ext -lpsb_base -lpsb_openacc -lopenblas -lmetis
LDLIBS=$(PSBGPULDLIBS)
# Includes
FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(FMFLAG). $(FMFLAG)$(PSBMODDIR) $(FMFLAG)$(PSBINCDIR) $(LIBRSB_DEFINES)
# Compiler flags
FFLAGS=-O0 -march=native -fopenacc -foffload=nvptx-none="-march=sm_70"
CFLAGS=-O0 -march=native
# Source files
SRCS=test.F90 vectoacc.F90
CSRC=timers.c
# Object files
OBJS=$(SRCS:.F90=.o) $(CSRC:.c=.o)
# Default rule
all: dir
dir:
@if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi
# Pattern rule for creating executables
%: %.o timers.o
$(FC) $(FFLAGS) $^ -o $@ $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
/bin/mv $@ $(EXEDIR)
# Compilation rules
%.o: %.F90
$(FC) $(FFLAGS) $(FINCLUDES) -c $< -o $@
%.o: %.c
$(CC) $(CFLAGS) $(FINCLUDES) -c $< -o $@
clean:
/bin/rm -fr *.o *.mod $(EXEDIR)/*
# Phony targets
.PHONY: all dir clean

@ -0,0 +1,617 @@
module psb_d_pde3d_mod
use psb_base_mod, only : psb_dpk_, psb_ipk_, psb_lpk_, psb_desc_type,&
& psb_dspmat_type, psb_d_vect_type, dzero,&
& psb_d_base_sparse_mat, psb_d_base_vect_type, &
& psb_i_base_vect_type, psb_l_base_vect_type
interface
function d_func_3d(x,y,z) result(val)
import :: psb_dpk_
real(psb_dpk_), intent(in) :: x,y,z
real(psb_dpk_) :: val
end function d_func_3d
end interface
interface psb_gen_pde3d
module procedure psb_d_gen_pde3d
end interface psb_gen_pde3d
contains
function d_null_func_3d(x,y,z) result(val)
real(psb_dpk_), intent(in) :: x,y,z
real(psb_dpk_) :: val
val = dzero
end function d_null_func_3d
!
! functions parametrizing the differential equation
!
!
! Note: b1, b2 and b3 are the coefficients of the first
! derivative of the unknown function. The default
! we apply here is to have them zero, so that the resulting
! matrix is symmetric/hermitian and suitable for
! testing with CG and FCG.
! When testing methods for non-hermitian matrices you can
! change the B1/B2/B3 functions to e.g. done/sqrt((3*done))
!
function b1(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: b1
real(psb_dpk_), intent(in) :: x,y,z
b1=done/sqrt((3*done))
end function b1
function b2(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: b2
real(psb_dpk_), intent(in) :: x,y,z
b2=done/sqrt((3*done))
end function b2
function b3(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: b3
real(psb_dpk_), intent(in) :: x,y,z
b3=done/sqrt((3*done))
end function b3
function c(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: c
real(psb_dpk_), intent(in) :: x,y,z
c=dzero
end function c
function a1(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: a1
real(psb_dpk_), intent(in) :: x,y,z
a1=done/80
end function a1
function a2(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: a2
real(psb_dpk_), intent(in) :: x,y,z
a2=done/80
end function a2
function a3(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: a3
real(psb_dpk_), intent(in) :: x,y,z
a3=done/80
end function a3
function g(x,y,z)
use psb_base_mod, only : psb_dpk_, done, dzero
implicit none
real(psb_dpk_) :: g
real(psb_dpk_), intent(in) :: x,y,z
g = dzero
if (x == done) then
g = done
else if (x == dzero) then
g = exp(y**2-z**2)
end if
end function g
!
! subroutine to allocate and fill in the coefficient matrix and
! the rhs.
!
subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info,&
& f,amold,vmold,imold,partition,nrl,iv,tnd)
use psb_base_mod
use psb_util_mod
!
! Discretizes the partial differential equation
!
! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u)
! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f
! dxdx dydy dzdz dx dy dz
!
! with Dirichlet boundary conditions
! u = g
!
! on the unit cube 0<=x,y,z<=1.
!
!
! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation.
!
implicit none
integer(psb_ipk_) :: idim
type(psb_dspmat_type) :: a
type(psb_d_vect_type) :: xv,bv
type(psb_desc_type) :: desc_a
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: info
character(len=*) :: afmt
procedure(d_func_3d), optional :: f
class(psb_d_base_sparse_mat), optional :: amold
class(psb_d_base_vect_type), optional :: vmold
class(psb_i_base_vect_type), optional :: imold
integer(psb_ipk_), optional :: partition, nrl,iv(:)
logical, optional :: tnd
! Local variables.
integer(psb_ipk_), parameter :: nb=20
type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
! For 3D partition
! Note: integer control variables going directly into an MPI call
! must be 4 bytes, i.e. psb_mpk_
integer(psb_mpk_) :: npdims(3), npp, minfo
integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz
integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:)
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_dpk_), allocatable :: val(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2
real(psb_dpk_), parameter :: rhs=dzero,one=done,zero=dzero
real(psb_dpk_) :: t0, t1, t2, t3, tasb, talc, ttot, tgen, tcdasb
integer(psb_ipk_) :: err_act
procedure(d_func_3d), pointer :: f_
logical :: tnd_
character(len=20) :: name, ch_err,tmpfmt
info = psb_success_
name = 'create_matrix'
call psb_erractionsave(err_act)
call psb_info(ctxt, iam, np)
if (present(f)) then
f_ => f
else
f_ => d_null_func_3d
end if
deltah = done/(idim+2)
sqdeltah = deltah*deltah
deltah2 = (2*done)* deltah
if (present(partition)) then
if ((1<= partition).and.(partition <= 3)) then
partition_ = partition
else
write(*,*) 'Invalid partition choice ',partition,' defaulting to 3'
partition_ = 3
end if
else
partition_ = 3
end if
! initialize array descriptor and sparse matrix storage. provide an
! estimate of the number of non zeroes
m = (1_psb_lpk_*idim)*idim*idim
n = m
nnz = ((n*7)/(np))
if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n
t0 = psb_wtime()
select case(partition_)
case(1)
! A BLOCK partition
if (present(nrl)) then
nr = nrl
else
!
! Using a simple BLOCK distribution.
!
nt = (m+np-1)/np
nr = max(0,min(nt,m-(iam*nt)))
end if
nt = nr
call psb_sum(ctxt,nt)
if (nt /= m) then
write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m
info = -1
call psb_barrier(ctxt)
call psb_abort(ctxt)
return
end if
!
! First example of use of CDALL: specify for each process a number of
! contiguous rows
!
call psb_cdall(ctxt,desc_a,info,nl=nr)
myidx = desc_a%get_global_indices()
nlr = size(myidx)
case(2)
! A partition defined by the user through IV
if (present(iv)) then
if (size(iv) /= m) then
write(psb_err_unit,*) iam, 'Initialization error: wrong IV size',size(iv),m
info = -1
call psb_barrier(ctxt)
call psb_abort(ctxt)
return
end if
else
write(psb_err_unit,*) iam, 'Initialization error: IV not present'
info = -1
call psb_barrier(ctxt)
call psb_abort(ctxt)
return
end if
!
! Second example of use of CDALL: specify for each row the
! process that owns it
!
call psb_cdall(ctxt,desc_a,info,vg=iv)
myidx = desc_a%get_global_indices()
nlr = size(myidx)
case(3)
! A 3-dimensional partition
! A nifty MPI function will split the process list
npdims = 0
call mpi_dims_create(np,3,npdims,info)
npx = npdims(1)
npy = npdims(2)
npz = npdims(3)
allocate(bndx(0:npx),bndy(0:npy),bndz(0:npz))
! We can reuse idx2ijk for process indices as well.
call idx2ijk(iamx,iamy,iamz,iam,npx,npy,npz,base=0)
! Now let's split the 3D cube in hexahedra
call dist1Didx(bndx,idim,npx)
mynx = bndx(iamx+1)-bndx(iamx)
call dist1Didx(bndy,idim,npy)
myny = bndy(iamy+1)-bndy(iamy)
call dist1Didx(bndz,idim,npz)
mynz = bndz(iamz+1)-bndz(iamz)
! How many indices do I own?
nlr = mynx*myny*mynz
allocate(myidx(nlr))
! Now, let's generate the list of indices I own
nr = 0
do i=bndx(iamx),bndx(iamx+1)-1
do j=bndy(iamy),bndy(iamy+1)-1
do k=bndz(iamz),bndz(iamz+1)-1
nr = nr + 1
call ijk2idx(myidx(nr),i,j,k,idim,idim,idim)
end do
end do
end do
if (nr /= nlr) then
write(psb_err_unit,*) iam,iamx,iamy,iamz, 'Initialization error: NR vs NLR ',&
& nr,nlr,mynx,myny,mynz
info = -1
call psb_barrier(ctxt)
call psb_abort(ctxt)
end if
!
! Third example of use of CDALL: specify for each process
! the set of global indices it owns.
!
call psb_cdall(ctxt,desc_a,info,vl=myidx)
case default
write(psb_err_unit,*) iam, 'Initialization error: should not get here'
info = -1
call psb_barrier(ctxt)
call psb_abort(ctxt)
return
end select
if (info == psb_success_) call psb_spall(a,desc_a,info,nnz=nnz,&
& dupl=psb_dupl_err_)
! define rhs from boundary conditions; also build initial guess
if (info == psb_success_) call psb_geall(xv,desc_a,info)
if (info == psb_success_) call psb_geall(bv,desc_a,info)
call psb_barrier(ctxt)
talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='insert rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
call psb_cdasb(desc_a,info,mold=imold)
tcdasb = psb_wtime()-t1
call psb_barrier(ctxt)
t1 = psb_wtime()
if (info == psb_success_) then
if (present(amold)) then
call psb_spasb(a,desc_a,info,mold=amold,bld_and=tnd)
else
call psb_spasb(a,desc_a,info,afmt=afmt,bld_and=tnd)
end if
end if
call psb_barrier(ctxt)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='asb rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
if (info == psb_success_) call psb_geasb(xv,desc_a,info,mold=vmold)
if (info == psb_success_) call psb_geasb(bv,desc_a,info,mold=vmold)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='asb rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
tasb = psb_wtime()-t1
call psb_barrier(ctxt)
ttot = psb_wtime() - t0
call psb_amx(ctxt,talc)
call psb_amx(ctxt,tgen)
call psb_amx(ctxt,tasb)
call psb_amx(ctxt,ttot)
if(iam == psb_root_) then
tmpfmt = a%get_fmt()
write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")')&
& tmpfmt
write(psb_out_unit,'("-allocation time : ",es12.5)') talc
write(psb_out_unit,'("-coeff. gen. time : ",es12.5)') tgen
write(psb_out_unit,'("-desc asbly time : ",es12.5)') tcdasb
write(psb_out_unit,'("- mat asbly time : ",es12.5)') tasb
write(psb_out_unit,'("-total time : ",es12.5)') ttot
end if
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(ctxt,err_act)
return
end subroutine psb_d_gen_pde3d
end module psb_d_pde3d_mod
program test
use psb_base_mod
use psb_ext_mod
use psb_oacc_mod
use psb_d_pde3d_mod
implicit none
integer(psb_ipk_) :: n, i, info, m, nrm, nz
integer(psb_ipk_), parameter :: ntests=80, ngpu=20
real(psb_dpk_) :: dot_dev, dot_host
type(psb_d_vect_oacc) :: tx, ty
type(psb_d_oacc_csr_sparse_mat) :: aacsr
real(psb_dpk_) :: t0, t1, t2, t3, csflp, elflp
double precision, external :: etime
type(psb_dspmat_type) :: a
type(psb_desc_type) :: desc_a
type(psb_d_vect_type) :: xxv, bv
type(psb_d_csr_sparse_mat) :: acsr
character(len=5) :: afmt='csr'
real(psb_dpk_), allocatable :: vv(:), ydev(:), yhost(:)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np, nth, idim
integer(psb_epk_) :: neq
call psb_init(ctxt)
call psb_info(ctxt, iam, np)
write(*,*) 'Enter size :'
read(*,*) idim
idim = max(1, idim)
n = idim**3
call psb_gen_pde3d(ctxt, idim, a, bv, xxv, desc_a, afmt, info)
call a%cp_to(acsr)
m = acsr%get_nrows()
n = acsr%get_ncols()
nz = acsr%get_nzeros()
call aacsr%all(m, n, nz, info)
aacsr%val = (acsr%val)
aacsr%ja = (acsr%ja)
aacsr%irp = (acsr%irp)
call aacsr%set_host()
call aacsr%sync()
call initialize(n)
call to_host()
t2 = etime()
do i = 1, ntests
dot_host = h_dot(n)
end do
t3 = etime()
call tx%all(n, info)
call ty%all(n, info)
vv = bv%get_vect()
call bv%set_vect(v1)
call tx%set_vect(v1)
call ty%set_vect(v2)
t0 = etime()
do i = 1, ntests * ngpu
dot_dev = tx%dot_v(n, ty)
end do
!$acc wait
t1 = etime()
write(*,*) ' Dot Results : dev:', dot_dev, ' host:', dot_host
write(*,*) ' Timing : dev:', t1 - t0, (t1 - t0) / (ntests * ngpu), &
' host:', t3 - t2, (t3 - t2) / ntests
call a%mv_from(acsr)
t2 = etime()
do i = 1, ntests
call a%spmm(done, bv, dzero, xxv, info)
end do
t3 = etime()
yhost = xxv%get_vect()
t0 = etime()
do i = 1, ntests * ngpu
call aacsr%vect_mv(done, tx, dzero, ty, info)
end do
!$acc wait
t1 = etime()
ydev = ty%get_vect()
write(*,*) 'Correctness check: ', maxval(abs(ydev(:) - yhost(:)))
write(*,*) ' CSR PROD '
write(*, '(2(a,f12.3,2x))') ' Timing (ms): '
csflp = 2.d0 * nz / ((t1 - t0) / (ntests * ngpu))
write(*, '(2(a,f12.3,2x))') ' dev:', 1e3 * (t1 - t0) / (ntests * ngpu), ' :', csflp / 1.d6
csflp = 2.d0 * nz / ((t3 - t2) / (ntests))
write(*, '(2(a,f12.3,2x))') ' host:', 1e3 * (t3 - t2) / ntests, ' :', csflp / 1.d6
write(*,*) 'Done'
call tx%free(info)
call ty%free(info)
call finalize_dev()
call finalize_host()
call psb_exit(ctxt)
end program test

@ -0,0 +1,97 @@
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
double wtime()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec)*1.0e3 + ((double)tt.tv_usec)*1.0e-3;
return(temp);
}
double timef_()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec)*1.0e3 + ((double)tt.tv_usec)*1.0e-3;
return(temp);
}
double timef()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec)*1.0e3 + ((double)tt.tv_usec)*1.0e-3;
return(temp);
}
double etime()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6;
return(temp);
}
double etime_()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6;
return(temp);
}
double etimef()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6;
return(temp);
}
double etimef_()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6;
return(temp);
}

@ -0,0 +1,85 @@
program vectoacc
use psb_base_mod
use psb_oacc_mod
implicit none
type(psb_d_vect_oacc) :: v3, v4, v5
integer(psb_ipk_) :: info, n, i
real(psb_dpk_) :: alpha, beta, result
double precision, external :: etime
real(psb_dpk_) :: dot_host, dot_dev, t_host, t_dev
double precision :: time_start, time_end
integer(psb_ipk_), parameter :: ntests=80, ngpu=20
write(*, *) 'Test of the vector operations with OpenACC'
write(*, *) 'Enter the size of the vectors'
read(*, *) n
alpha = 2.0
beta = 0.5
call v3%all(n, info)
call v4%all(n, info)
call v5%all(n, info)
do i = 1, n
v3%v(i) = real(i, psb_dpk_)
v4%v(i) = real(n - i, psb_dpk_)
end do
call v3%set_dev()
call v4%set_dev()
call v3%scal(alpha)
call v3%sync()
do i = 1, n
if (v3%v(i) /= alpha * real(i, psb_dpk_)) then
write(*, *) 'Scal error : index', i
end if
end do
write(*, *) 'Scal test passed'
result = v3%dot_v(n, v4)
call v3%sync()
call v4%sync()
if (result /= sum(v3%v * v4%v)) then
write(*, *) 'Dot_v error, expected result:', sum(v3%v * v4%v), 'instead of :', result
end if
write(*, *) 'Dot_v test passed'
result = v3%nrm2(n)
call v3%sync()
if (result /= sqrt(sum(v3%v ** 2))) then
write(*, *) 'nrm2 error, expected result:', sqrt(sum(v3%v ** 2)), 'instead of :', result
end if
write(*, *) 'nrm2 test passed'
call v3%set_host()
call v4%set_host()
time_start = etime()
do i = 1, ntests
dot_host = sum(v3%v * v4%v)
end do
time_end = etime()
t_host = (time_end - time_start) / real(ntests)
write(*, *) 'Performance host: ', t_host, ' sec'
call v3%set_dev()
call v4%set_dev()
time_start = etime()
do i = 1, ntests
dot_dev = v3%dot_v(n, v4)
end do
!$acc wait
time_end = etime()
t_dev = (time_end - time_start) / real(ntests)
write(*, *) 'Performance device: ', t_dev, ' sec'
call v3%free(info)
call v4%free(info)
call v5%free(info)
end program vectoacc
Loading…
Cancel
Save