diff --git a/openacc/Makefile b/openacc/Makefile new file mode 100644 index 00000000..3590249c --- /dev/null +++ b/openacc/Makefile @@ -0,0 +1,48 @@ +.SUFFIXES: +.SUFFIXES: .F90 .f90 .o .s .c + +# Compilers and flags +CC=mpicc +FC=mpif90 +FCOPT=-O0 -march=native +OFFLOAD=-fopenacc -foffload=nvptx-none="-march=sm_70" + +# Directories +LIBDIR=../lib +INCDIR=../include +MODDIR=../modules +IMPLDIR=./impl # Adding the impl directory + +# Include and library paths +INCLUDES=-I$(LIBDIR) -I$(INCDIR) -I$(MODDIR) -I$(IMPLDIR) +LIBS=-L$(LIBDIR) -lpsb_util -lpsb_ext -lpsb_base -lopenblas -lmetis + +# Source files +FOBJS= psb_i_oacc_vect_mod.o psb_d_oacc_vect_mod.o \ + psb_oacc_mod.o psb_d_oacc_csr_mat_mod.o \ + impl/psb_d_oacc_csr_vect_mv.o + +# Library name +LIBNAME=libpsb_openacc.a + +# Rules +all: $(LIBNAME) + +$(LIBNAME): $(FOBJS) + ar cr $(LIBNAME) $(FOBJS) + /bin/cp -p $(LIBNAME) $(LIBDIR) + +clean: + /bin/rm -fr *.o $(LIBNAME) *.mod impl/*.o + +.f90.o: + $(FC) $(FCOPT) $(OFFLOAD) $(INCLUDES) -c $< -o $@ + +.c.o: + $(CC) -c $< -o $@ + +.F90.o: + $(FC) $(FCOPT) $(OFFLOAD) $(INCLUDES) -c $< -o $@ + +.F90.s: + $(FC) $(FCOPT) $(INCLUDES) -c -S $< -o $@ diff --git a/openacc/impl/psb_d_oacc_csr_vect_mv.F90 b/openacc/impl/psb_d_oacc_csr_vect_mv.F90 new file mode 100644 index 00000000..f0394591 --- /dev/null +++ b/openacc/impl/psb_d_oacc_csr_vect_mv.F90 @@ -0,0 +1,61 @@ +subroutine psb_d_oacc_csr_vect_mv(alpha, a, x, beta, y, info, trans) + use psb_base_mod + use psb_d_oacc_csr_mat_mod, psb_protect_name => psb_d_oacc_csr_vect_mv + implicit none + + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_oacc_csr_sparse_mat), intent(in) :: a + class(psb_d_base_vect_type), intent(inout) :: x, y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + integer(psb_ipk_) :: m, n + + info = psb_success_ + m = a%get_nrows() + n = a%get_ncols() + + if ((n /= size(x%v)) .or. (n /= size(y%v))) then + write(0,*) 'Size error ', m, n, size(x%v), size(y%v) + info = psb_err_invalid_mat_state_ + return + end if + + if (a%is_host()) call a%sync() + if (x%is_host()) call x%sync() + if (y%is_host()) call y%sync() + + call inner_spmv(m, n, alpha, a%val, a%ja, a%irp, x%v, beta, y%v, info) + call y%set_dev() + +contains + + subroutine inner_spmv(m, n, alpha, val, ja, irp, x, beta, y, info) + implicit none + integer(psb_ipk_) :: m, n + real(psb_dpk_), intent(in) :: alpha, beta + real(psb_dpk_) :: val(:), x(:), y(:) + integer(psb_ipk_) :: ja(:), irp(:) + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i, j, ii, isz + real(psb_dpk_) :: tmp + integer(psb_ipk_), parameter :: vsz = 256 + + info = 0 + + !$acc parallel loop vector_length(vsz) private(isz) + do ii = 1, m, vsz + isz = min(vsz, m - ii + 1) + !$acc loop independent private(tmp) + do i = ii, ii + isz - 1 + tmp = 0.0_psb_dpk_ + !$acc loop seq + do j = irp(i), irp(i + 1) - 1 + tmp = tmp + val(j) * x(ja(j)) + end do + y(i) = alpha * tmp + beta * y(i) + end do + end do + end subroutine inner_spmv + +end subroutine psb_d_oacc_csr_vect_mv diff --git a/openacc/psb_d_oacc_csr_mat_mod.F90 b/openacc/psb_d_oacc_csr_mat_mod.F90 new file mode 100644 index 00000000..c6b7e3d0 --- /dev/null +++ b/openacc/psb_d_oacc_csr_mat_mod.F90 @@ -0,0 +1,185 @@ +module psb_d_oacc_csr_mat_mod + + use iso_c_binding + use psb_d_mat_mod + use psb_d_oacc_vect_mod + use oaccsparse_mod + + integer(psb_ipk_), parameter, private :: is_host = -1 + integer(psb_ipk_), parameter, private :: is_sync = 0 + integer(psb_ipk_), parameter, private :: is_dev = 1 + + type, extends(psb_d_csr_sparse_mat) :: psb_d_oacc_csr_sparse_mat + integer(psb_ipk_) :: devstate = is_host + contains + procedure, pass(a) :: all => d_oacc_csr_all + procedure, pass(a) :: is_host => d_oacc_csr_is_host + procedure, pass(a) :: is_sync => d_oacc_csr_is_sync + procedure, pass(a) :: is_dev => d_oacc_csr_is_dev + procedure, pass(a) :: set_host => d_oacc_csr_set_host + procedure, pass(a) :: set_sync => d_oacc_csr_set_sync + procedure, pass(a) :: set_dev => d_oacc_csr_set_dev + procedure, pass(a) :: sync_space => d_oacc_csr_sync_space + procedure, pass(a) :: sync => d_oacc_csr_sync + procedure, pass(a) :: vect_mv => psb_d_oacc_csr_vect_mv + end type psb_d_oacc_csr_sparse_mat + + interface + subroutine psb_d_oacc_csr_vect_mv(alpha, a, x, beta, y, info, trans) + import :: psb_d_oacc_csr_sparse_mat, psb_dpk_, psb_d_base_vect_type, psb_ipk_ + class(psb_d_oacc_csr_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_vect_type), intent(inout) :: x, y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_d_oacc_csr_vect_mv + end interface + +contains + + subroutine d_oacc_csr_all(m, n, nz, a, info) + implicit none + integer(psb_ipk_), intent(in) :: m, n, nz + class(psb_d_oacc_csr_sparse_mat), intent(out) :: a + integer(psb_ipk_), intent(out) :: info + + info = 0 + if (allocated(a%val)) then + !$acc exit data delete(a%val) finalize + deallocate(a%val, stat=info) + end if + if (allocated(a%ja)) then + !$acc exit data delete(a%ja) finalize + deallocate(a%ja, stat=info) + end if + if (allocated(a%irp)) then + !$acc exit data delete(a%irp) finalize + deallocate(a%irp, stat=info) + end if + + call a%set_nrows(m) + call a%set_ncols(n) + + allocate(a%val(nz),stat=info) + allocate(a%ja(nz),stat=info) + allocate(a%irp(m+1),stat=info) + if (info == 0) call a%set_host() + if (info == 0) call a%sync_space() + end subroutine d_oacc_csr_all + + function d_oacc_csr_is_host(a) result(res) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(in) :: a + logical :: res + + res = (a%devstate == is_host) + end function d_oacc_csr_is_host + + function d_oacc_csr_is_sync(a) result(res) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(in) :: a + logical :: res + + res = (a%devstate == is_sync) + end function d_oacc_csr_is_sync + + function d_oacc_csr_is_dev(a) result(res) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(in) :: a + logical :: res + + res = (a%devstate == is_dev) + end function d_oacc_csr_is_dev + + subroutine d_oacc_csr_set_host(a) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a + + a%devstate = is_host + end subroutine d_oacc_csr_set_host + + subroutine d_oacc_csr_set_sync(a) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a + + a%devstate = is_sync + end subroutine d_oacc_csr_set_sync + + subroutine d_oacc_csr_set_dev(a) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a + + a%devstate = is_dev + end subroutine d_oacc_csr_set_dev + + subroutine d_oacc_csr_sync_space(a) + implicit none + class(psb_d_oacc_csr_sparse_mat), intent(inout) :: a + if (allocated(a%val)) then + call d_oacc_create_dev(a%val) + end if + if (allocated(a%ja)) then + call i_oacc_create_dev(a%ja) + end if + if (allocated(a%irp)) then + call i_oacc_create_dev(a%irp) + end if + contains + subroutine d_oacc_create_dev(v) + implicit none + real(psb_dpk_), intent(in) :: v(:) + !$acc enter data copyin(v) + end subroutine d_oacc_create_dev + subroutine i_oacc_create_dev(v) + implicit none + integer(psb_ipk_), intent(in) :: v(:) + !$acc enter data copyin(v) + end subroutine i_oacc_create_dev + end subroutine d_oacc_csr_sync_space + + subroutine d_oacc_csr_sync(a) + implicit none + class(psb_d_oacc_csr_sparse_mat), target, intent(in) :: a + class(psb_d_oacc_csr_sparse_mat), pointer :: tmpa + integer(psb_ipk_) :: info + + tmpa => a + if (a%is_dev()) then + call d_oacc_csr_to_host(a%val) + call i_oacc_csr_to_host(a%ja) + call i_oacc_csr_to_host(a%irp) + else if (a%is_host()) then + call d_oacc_csr_to_dev(a%val) + call i_oacc_csr_to_dev(a%ja) + call i_oacc_csr_to_dev(a%irp) + end if + call tmpa%set_sync() + end subroutine d_oacc_csr_sync + + subroutine d_oacc_csr_to_dev(v) + implicit none + real(psb_dpk_), intent(in) :: v(:) + !$acc update device(v) + end subroutine d_oacc_csr_to_dev + + subroutine d_oacc_csr_to_host(v) + implicit none + real(psb_dpk_), intent(in) :: v(:) + !$acc update self(v) + end subroutine d_oacc_csr_to_host + + subroutine i_oacc_csr_to_dev(v) + implicit none + integer(psb_ipk_), intent(in) :: v(:) + !$acc update device(v) + end subroutine i_oacc_csr_to_dev + + subroutine i_oacc_csr_to_host(v) + implicit none + integer(psb_ipk_), intent(in) :: v(:) + !$acc update self(v) + end subroutine i_oacc_csr_to_host + + +end module psb_d_oacc_csr_mat_mod + diff --git a/openacc/psb_d_oacc_vect_mod.F90 b/openacc/psb_d_oacc_vect_mod.F90 new file mode 100644 index 00000000..ac5428f3 --- /dev/null +++ b/openacc/psb_d_oacc_vect_mod.F90 @@ -0,0 +1,966 @@ +module psb_d_oacc_vect_mod + use iso_c_binding + use psb_const_mod + use psb_error_mod + use psb_d_vect_mod + use psb_i_oacc_vect_mod + use psb_i_vect_mod + + integer(psb_ipk_), parameter, private :: is_host = -1 + integer(psb_ipk_), parameter, private :: is_sync = 0 + integer(psb_ipk_), parameter, private :: is_dev = 1 + + type, extends(psb_d_base_vect_type) :: psb_d_vect_oacc + integer :: state = is_host + + contains + procedure, pass(x) :: get_nrows => d_oacc_get_nrows + procedure, nopass :: get_fmt => d_oacc_get_fmt + + procedure, pass(x) :: all => d_oacc_vect_all + procedure, pass(x) :: zero => d_oacc_zero + procedure, pass(x) :: asb_m => d_oacc_asb_m + procedure, pass(x) :: sync => d_oacc_sync + procedure, pass(x) :: sync_space => d_oacc_sync_space + procedure, pass(x) :: bld_x => d_oacc_bld_x + procedure, pass(x) :: bld_mn => d_oacc_bld_mn + procedure, pass(x) :: free => d_oacc_vect_free + procedure, pass(x) :: ins_a => d_oacc_ins_a + procedure, pass(x) :: ins_v => d_oacc_ins_v + procedure, pass(x) :: is_host => d_oacc_is_host + procedure, pass(x) :: is_dev => d_oacc_is_dev + procedure, pass(x) :: is_sync => d_oacc_is_sync + procedure, pass(x) :: set_host => d_oacc_set_host + procedure, pass(x) :: set_dev => d_oacc_set_dev + procedure, pass(x) :: set_sync => d_oacc_set_sync + procedure, pass(x) :: set_scal => d_oacc_set_scal + + procedure, pass(x) :: gthzv_x => d_oacc_gthzv_x + procedure, pass(x) :: gthzbuf_x => d_oacc_gthzbuf + procedure, pass(y) :: sctb => d_oacc_sctb + procedure, pass(y) :: sctb_x => d_oacc_sctb_x + procedure, pass(y) :: sctb_buf => d_oacc_sctb_buf + + procedure, pass(x) :: get_size => oacc_get_size + procedure, pass(x) :: dot_v => d_oacc_vect_dot + procedure, pass(x) :: dot_a => d_oacc_dot_a + procedure, pass(y) :: axpby_v => d_oacc_axpby_v + procedure, pass(y) :: axpby_a => d_oacc_axpby_a + procedure, pass(z) :: abgdxyz => d_oacc_abgdxyz + procedure, pass(y) :: mlt_v => d_oacc_mlt_v + procedure, pass(y) :: mlt_a => d_oacc_mlt_a + procedure, pass(z) :: mlt_a_2 => d_oacc_mlt_a_2 + procedure, pass(z) :: mlt_v_2 => d_oacc_mlt_v_2 + procedure, pass(x) :: scal => d_oacc_scal + procedure, pass(x) :: nrm2 => d_oacc_nrm2 + procedure, pass(x) :: amax => d_oacc_amax + procedure, pass(x) :: asum => d_oacc_asum + procedure, pass(x) :: absval1 => d_oacc_absval1 + procedure, pass(x) :: absval2 => d_oacc_absval2 + + end type psb_d_vect_oacc + + real(psb_dpk_), allocatable :: v1(:),v2(:),p(:) + +contains + + subroutine d_oacc_absval1(x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: n, i + + if (x%is_host()) call x%sync_space() + n = size(x%v) + !$acc parallel loop + do i = 1, n + x%v(i) = abs(x%v(i)) + end do + call x%set_dev() + end subroutine d_oacc_absval1 + + subroutine d_oacc_absval2(x, y) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + class(psb_d_base_vect_type), intent(inout) :: y + integer(psb_ipk_) :: n + integer(psb_ipk_) :: i + + n = min(size(x%v), size(y%v)) + select type (yy => y) + class is (psb_d_vect_oacc) + if (x%is_host()) call x%sync() + if (yy%is_host()) call yy%sync() + !$acc parallel loop + do i = 1, n + yy%v(i) = abs(x%v(i)) + end do + class default + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() + call x%psb_d_base_vect_type%absval(y) + end select + end subroutine d_oacc_absval2 + + + + subroutine d_oacc_scal(alpha, x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + real(psb_dpk_), intent(in) :: alpha + integer(psb_ipk_) :: info + integer(psb_ipk_) :: i + + if (x%is_host()) call x%sync_space() + !$acc parallel loop + do i = 1, size(x%v) + x%v(i) = alpha * x%v(i) + end do + call x%set_dev() + end subroutine d_oacc_scal + + function d_oacc_nrm2(n, x) result(res) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_) :: res + integer(psb_ipk_) :: info + real(psb_dpk_) :: sum + integer(psb_ipk_) :: i + + if (x%is_host()) call x%sync_space() + sum = 0.0 + !$acc parallel loop reduction(+:sum) + do i = 1, n + sum = sum + x%v(i) * x%v(i) + end do + res = sqrt(sum) + end function d_oacc_nrm2 + + function d_oacc_amax(n, x) result(res) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_) :: res + integer(psb_ipk_) :: info + real(psb_dpk_) :: max_val + integer(psb_ipk_) :: i + + if (x%is_host()) call x%sync_space() + max_val = -huge(0.0) + !$acc parallel loop reduction(max:max_val) + do i = 1, n + if (x%v(i) > max_val) max_val = x%v(i) + end do + res = max_val + end function d_oacc_amax + + function d_oacc_asum(n, x) result(res) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_) :: res + integer(psb_ipk_) :: info + real(psb_dpk_) :: sum + integer(psb_ipk_) :: i + + if (x%is_host()) call x%sync_space() + sum = 0.0 + !$acc parallel loop reduction(+:sum) + do i = 1, n + sum = sum + abs(x%v(i)) + end do + res = sum + end function d_oacc_asum + + + subroutine d_oacc_mlt_v(x, y, info) + use psi_serial_mod + implicit none + class(psb_d_base_vect_type), intent(inout) :: x + class(psb_d_vect_oacc), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: i, n + + info = 0 + n = min(x%get_nrows(), y%get_nrows()) + select type(xx => x) + type is (psb_d_base_vect_type) + if (y%is_dev()) call y%sync() + !$acc parallel loop + do i = 1, n + y%v(i) = y%v(i) * xx%v(i) + end do + call y%set_host() + class default + if (xx%is_dev()) call xx%sync() + if (y%is_dev()) call y%sync() + !$acc parallel loop + do i = 1, n + y%v(i) = y%v(i) * xx%v(i) + end do + call y%set_host() + end select + end subroutine d_oacc_mlt_v + + subroutine d_oacc_mlt_a(x, y, info) + use psi_serial_mod + implicit none + real(psb_dpk_), intent(in) :: x(:) + class(psb_d_vect_oacc), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i, n + + info = 0 + if (y%is_dev()) call y%sync_space() + !$acc parallel loop + do i = 1, size(x) + y%v(i) = y%v(i) * x(i) + end do + call y%set_host() + end subroutine d_oacc_mlt_a + + subroutine d_oacc_mlt_a_2(alpha, x, y, beta, z, info) + use psi_serial_mod + implicit none + real(psb_dpk_), intent(in) :: alpha, beta + real(psb_dpk_), intent(in) :: x(:) + real(psb_dpk_), intent(in) :: y(:) + class(psb_d_vect_oacc), intent(inout) :: z + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i, n + + info = 0 + if (z%is_dev()) call z%sync_space() + !$acc parallel loop + do i = 1, size(x) + z%v(i) = alpha * x(i) * y(i) + beta * z%v(i) + end do + call z%set_host() + end subroutine d_oacc_mlt_a_2 + + subroutine d_oacc_mlt_v_2(alpha, x, y, beta, z, info, conjgx, conjgy) + use psi_serial_mod + use psb_string_mod + implicit none + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_vect_type), intent(inout) :: x + class(psb_d_base_vect_type), intent(inout) :: y + class(psb_d_vect_oacc), intent(inout) :: z + integer(psb_ipk_), intent(out) :: info + character(len=1), intent(in), optional :: conjgx, conjgy + integer(psb_ipk_) :: i, n + logical :: conjgx_, conjgy_ + + conjgx_ = .false. + conjgy_ = .false. + if (present(conjgx)) conjgx_ = (psb_toupper(conjgx) == 'C') + if (present(conjgy)) conjgy_ = (psb_toupper(conjgy) == 'C') + + n = min(x%get_nrows(), y%get_nrows(), z%get_nrows()) + + info = 0 + select type(xx => x) + class is (psb_d_vect_oacc) + select type (yy => y) + class is (psb_d_vect_oacc) + if (xx%is_host()) call xx%sync_space() + if (yy%is_host()) call yy%sync_space() + if ((beta /= dzero) .and. (z%is_host())) call z%sync_space() + !$acc parallel loop + do i = 1, n + z%v(i) = alpha * xx%v(i) * yy%v(i) + beta * z%v(i) + end do + call z%set_dev() + class default + if (xx%is_dev()) call xx%sync_space() + if (yy%is_dev()) call yy%sync() + if ((beta /= dzero) .and. (z%is_dev())) call z%sync_space() + !$acc parallel loop + do i = 1, n + z%v(i) = alpha * xx%v(i) * yy%v(i) + beta * z%v(i) + end do + call z%set_host() + end select + class default + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() + if ((beta /= dzero) .and. (z%is_dev())) call z%sync_space() + !$acc parallel loop + do i = 1, n + z%v(i) = alpha * x%v(i) * y%v(i) + beta * z%v(i) + end do + call z%set_host() + end select + end subroutine d_oacc_mlt_v_2 + + + subroutine d_oacc_axpby_v(m, alpha, x, beta, y, info) + use psi_serial_mod + implicit none + integer(psb_ipk_), intent(in) :: m + class(psb_d_base_vect_type), intent(inout) :: x + class(psb_d_vect_oacc), intent(inout) :: y + real(psb_dpk_), intent(in) :: alpha, beta + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: nx, ny, i + + info = psb_success_ + + select type(xx => x) + type is (psb_d_vect_oacc) + if ((beta /= dzero) .and. y%is_host()) call y%sync_space() + if (xx%is_host()) call xx%sync_space() + nx = size(xx%v) + ny = size(y%v) + if ((nx < m) .or. (ny < m)) then + info = psb_err_internal_error_ + else + !$acc parallel loop + do i = 1, m + y%v(i) = alpha * xx%v(i) + beta * y%v(i) + end do + end if + call y%set_dev() + class default + if ((alpha /= dzero) .and. (x%is_dev())) call x%sync() + call y%axpby(m, alpha, x%v, beta, info) + end select + end subroutine d_oacc_axpby_v + + subroutine d_oacc_axpby_a(m, alpha, x, beta, y, info) + use psi_serial_mod + implicit none + integer(psb_ipk_), intent(in) :: m + real(psb_dpk_), intent(in) :: x(:) + class(psb_d_vect_oacc), intent(inout) :: y + real(psb_dpk_), intent(in) :: alpha, beta + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i + + if ((beta /= dzero) .and. (y%is_dev())) call y%sync_space() + !$acc parallel loop + do i = 1, m + y%v(i) = alpha * x(i) + beta * y%v(i) + end do + call y%set_host() + end subroutine d_oacc_axpby_a + + subroutine d_oacc_abgdxyz(m, alpha, beta, gamma, delta, x, y, z, info) + use psi_serial_mod + implicit none + integer(psb_ipk_), intent(in) :: m + class(psb_d_base_vect_type), intent(inout) :: x + class(psb_d_base_vect_type), intent(inout) :: y + class(psb_d_vect_oacc), intent(inout) :: z + real(psb_dpk_), intent(in) :: alpha, beta, gamma, delta + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: nx, ny, nz, i + logical :: gpu_done + + info = psb_success_ + gpu_done = .false. + + select type(xx => x) + class is (psb_d_vect_oacc) + select type(yy => y) + class is (psb_d_vect_oacc) + select type(zz => z) + class is (psb_d_vect_oacc) + if ((beta /= dzero) .and. yy%is_host()) call yy%sync_space() + if ((delta /= dzero) .and. zz%is_host()) call zz%sync_space() + if (xx%is_host()) call xx%sync_space() + nx = size(xx%v) + ny = size(yy%v) + nz = size(zz%v) + if ((nx < m) .or. (ny < m) .or. (nz < m)) then + info = psb_err_internal_error_ + else + !$acc parallel loop + do i = 1, m + yy%v(i) = alpha * xx%v(i) + beta * yy%v(i) + zz%v(i) = gamma * yy%v(i) + delta * zz%v(i) + end do + end if + call yy%set_dev() + call zz%set_dev() + gpu_done = .true. + end select + end select + end select + + if (.not. gpu_done) then + if (x%is_host()) call x%sync() + if (y%is_host()) call y%sync() + if (z%is_host()) call z%sync() + call y%axpby(m, alpha, x, beta, info) + call z%axpby(m, gamma, y, delta, info) + end if + end subroutine d_oacc_abgdxyz + + + subroutine d_oacc_sctb_buf(i, n, idx, beta, y) + use psb_base_mod + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type) :: idx + real(psb_dpk_) :: beta + class(psb_d_vect_oacc) :: y + integer(psb_ipk_) :: info + + if (.not.allocated(y%combuf)) then + call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') + return + end if + + select type(ii => idx) + class is (psb_i_vect_oacc) + if (ii%is_host()) call ii%sync_space(info) + if (y%is_host()) call y%sync_space() + + !$acc parallel loop + do i = 1, n + y%v(ii%v(i)) = beta * y%v(ii%v(i)) + y%combuf(i) + end do + + class default + !$acc parallel loop + do i = 1, n + y%v(idx%v(i)) = beta * y%v(idx%v(i)) + y%combuf(i) + end do + end select + end subroutine d_oacc_sctb_buf + + subroutine d_oacc_sctb_x(i, n, idx, x, beta, y) + use psb_base_mod + implicit none + integer(psb_ipk_):: i, n + class(psb_i_base_vect_type) :: idx + real(psb_dpk_) :: beta, x(:) + class(psb_d_vect_oacc) :: y + integer(psb_ipk_) :: info, ni + + select type(ii => idx) + class is (psb_i_vect_oacc) + if (ii%is_host()) call ii%sync_space(info) + class default + call psb_errpush(info, 'd_oacc_sctb_x') + return + end select + + if (y%is_host()) call y%sync_space() + + !$acc parallel loop + do i = 1, n + y%v(idx%v(i)) = beta * y%v(idx%v(i)) + x(i) + end do + + call y%set_dev() + end subroutine d_oacc_sctb_x + + + + subroutine d_oacc_sctb(n, idx, x, beta, y) + use psb_base_mod + implicit none + integer(psb_ipk_) :: n + integer(psb_ipk_) :: idx(:) + real(psb_dpk_) :: beta, x(:) + class(psb_d_vect_oacc) :: y + integer(psb_ipk_) :: info + integer(psb_ipk_) :: i + + if (n == 0) return + if (y%is_dev()) call y%sync_space() + + !$acc parallel loop + do i = 1, n + y%v(idx(i)) = beta * y%v(idx(i)) + x(i) + end do + + call y%set_host() + end subroutine d_oacc_sctb + + + subroutine d_oacc_gthzbuf(i, n, idx, x) + use psb_base_mod + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type) :: idx + class(psb_d_vect_oacc) :: x + integer(psb_ipk_) :: info + + info = 0 + if (.not.allocated(x%combuf)) then + call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') + return + end if + + select type(ii => idx) + class is (psb_i_vect_oacc) + if (ii%is_host()) call ii%sync_space(info) + class default + call psb_errpush(info, 'd_oacc_gthzbuf') + return + end select + + if (x%is_host()) call x%sync_space() + + !$acc parallel loop + do i = 1, n + x%combuf(i) = x%v(idx%v(i)) + end do + end subroutine d_oacc_gthzbuf + + subroutine d_oacc_gthzv_x(i, n, idx, x, y) + use psb_base_mod + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type):: idx + real(psb_dpk_) :: y(:) + class(psb_d_vect_oacc):: x + integer(psb_ipk_) :: info + + info = 0 + + select type(ii => idx) + class is (psb_i_vect_oacc) + if (ii%is_host()) call ii%sync_space(info) + class default + call psb_errpush(info, 'd_oacc_gthzv_x') + return + end select + + if (x%is_host()) call x%sync_space() + + !$acc parallel loop + do i = 1, n + y(i) = x%v(idx%v(i)) + end do + end subroutine d_oacc_gthzv_x + + subroutine d_oacc_ins_v(n, irl, val, dupl, x, info) + use psi_serial_mod + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n, dupl + class(psb_i_base_vect_type), intent(inout) :: irl + class(psb_d_base_vect_type), intent(inout) :: val + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: i, isz + logical :: done_oacc + + info = 0 + if (psb_errstatus_fatal()) return + + done_oacc = .false. + select type(virl => irl) + type is (psb_i_vect_oacc) + select type(vval => val) + type is (psb_d_vect_oacc) + if (vval%is_host()) call vval%sync_space() + if (virl%is_host()) call virl%sync_space(info) + if (x%is_host()) call x%sync_space() + !$acc parallel loop + do i = 1, n + x%v(virl%v(i)) = vval%v(i) + end do + call x%set_dev() + done_oacc = .true. + end select + end select + + if (.not.done_oacc) then + select type(virl => irl) + type is (psb_i_vect_oacc) + if (virl%is_dev()) call virl%sync_space(info) + end select + select type(vval => val) + type is (psb_d_vect_oacc) + if (vval%is_dev()) call vval%sync_space() + end select + call x%ins(n, irl%v, val%v, dupl, info) + end if + + if (info /= 0) then + call psb_errpush(info, 'oacc_vect_ins') + return + end if + + end subroutine d_oacc_ins_v + + + + subroutine d_oacc_ins_a(n, irl, val, dupl, x, info) + use psi_serial_mod + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n, dupl + integer(psb_ipk_), intent(in) :: irl(:) + real(psb_dpk_), intent(in) :: val(:) + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: i + + info = 0 + if (x%is_dev()) call x%sync_space() + call x%psb_d_base_vect_type%ins(n, irl, val, dupl, info) + call x%set_host() + !$acc update device(x%v) + + end subroutine d_oacc_ins_a + + + + subroutine d_oacc_bld_mn(x, n) + use psb_base_mod + implicit none + integer(psb_mpk_), intent(in) :: n + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: info + + call x%all(n, info) + if (info /= 0) then + call psb_errpush(info, 'd_oacc_bld_mn', i_err=(/n, n, n, n, n/)) + end if + call x%set_host() + !$acc update device(x%v) + + end subroutine d_oacc_bld_mn + + + subroutine d_oacc_bld_x(x, this) + use psb_base_mod + implicit none + real(psb_dpk_), intent(in) :: this(:) + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: info + + call psb_realloc(size(this), x%v, info) + if (info /= 0) then + info = psb_err_alloc_request_ + call psb_errpush(info, 'd_oacc_bld_x', & + i_err=(/size(this), izero, izero, izero, izero/)) + return + end if + + x%v(:) = this(:) + call x%set_host() + !$acc update device(x%v) + + end subroutine d_oacc_bld_x + + + subroutine d_oacc_asb_m(n, x, info) + use psb_base_mod + implicit none + integer(psb_mpk_), intent(in) :: n + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(out) :: info + integer(psb_mpk_) :: nd + + info = psb_success_ + + if (x%is_dev()) then + nd = size(x%v) + if (nd < n) then + call x%sync() + call x%psb_d_base_vect_type%asb(n, info) + if (info == psb_success_) call x%sync_space() + call x%set_host() + end if + else + if (size(x%v) < n) then + call x%psb_d_base_vect_type%asb(n, info) + if (info == psb_success_) call x%sync_space() + call x%set_host() + end if + end if + end subroutine d_oacc_asb_m + + + + subroutine d_oacc_set_scal(x, val, first, last) + class(psb_d_vect_oacc), intent(inout) :: x + real(psb_dpk_), intent(in) :: val + integer(psb_ipk_), optional :: first, last + + integer(psb_ipk_) :: first_, last_ + first_ = 1 + last_ = x%get_nrows() + if (present(first)) first_ = max(1, first) + if (present(last)) last_ = min(last, last_) + + !$acc parallel loop + do i = first_, last_ + x%v(i) = val + end do + !$acc end parallel loop + + call x%set_dev() + end subroutine d_oacc_set_scal + + + + subroutine d_oacc_zero(x) + use psi_serial_mod + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + call x%set_dev() + call x%set_scal(dzero) + end subroutine d_oacc_zero + + function d_oacc_get_nrows(x) result(res) + implicit none + class(psb_d_vect_oacc), intent(in) :: x + integer(psb_ipk_) :: res + + if (allocated(x%v)) res = size(x%v) + end function d_oacc_get_nrows + + function d_oacc_get_fmt() result(res) + implicit none + character(len=5) :: res + res = "dOACC" + + end function d_oacc_get_fmt + + function d_oacc_vect_dot(n, x, y) result(res) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + class(psb_d_base_vect_type), intent(inout) :: y + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_) :: res + real(psb_dpk_), external :: ddot + integer(psb_ipk_) :: info + integer(psb_ipk_) :: i + + res = dzero + + select type(yy => y) + type is (psb_d_base_vect_type) + if (x%is_dev()) call x%sync() + res = ddot(n, x%v, 1, yy%v, 1) + type is (psb_d_vect_oacc) + if (x%is_host()) call x%sync() + if (yy%is_host()) call yy%sync() + + !$acc parallel loop reduction(+:res) present(x%v, yy%v) + do i = 1, n + res = res + x%v(i) * yy%v(i) + end do + !$acc end parallel loop + + class default + call x%sync() + res = y%dot(n, x%v) + end select + + end function d_oacc_vect_dot + + + + + function d_oacc_dot_a(n, x, y) result(res) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + real(psb_dpk_), intent(in) :: y(:) + integer(psb_ipk_), intent(in) :: n + real(psb_dpk_) :: res + real(psb_dpk_), external :: ddot + + if (x%is_dev()) call x%sync() + res = ddot(n, y, 1, x%v, 1) + + end function d_oacc_dot_a + + ! subroutine d_oacc_set_vect(x,y) + ! implicit none + ! class(psb_d_vect_oacc), intent(inout) :: x + ! real(psb_dpk_), intent(in) :: y(:) + ! integer(psb_ipk_) :: info + + ! if (size(x%v) /= size(y)) then + ! call x%free(info) + ! call x%all(size(y),info) + ! end if + ! x%v(:) = y(:) + ! call x%set_host() + ! end subroutine d_oacc_set_vect + + subroutine d_oacc_to_dev(v) + implicit none + real(psb_dpk_) :: v(:) + !$acc update device(v) + end subroutine d_oacc_to_dev + + subroutine d_oacc_to_host(v) + implicit none + real(psb_dpk_) :: v(:) + !$acc update self(v) + end subroutine d_oacc_to_host + + subroutine d_oacc_sync_space(x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + if (allocated(x%v)) then + call d_oacc_create_dev(x%v) + end if + contains + subroutine d_oacc_create_dev(v) + implicit none + real(psb_dpk_) :: v(:) + !$acc enter data copyin(v) + end subroutine d_oacc_create_dev + end subroutine d_oacc_sync_space + + subroutine d_oacc_sync(x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + if (x%is_dev()) then + call d_oacc_to_host(x%v) + end if + if (x%is_host()) then + call d_oacc_to_dev(x%v) + end if + call x%set_sync() + end subroutine d_oacc_sync + + subroutine d_oacc_set_host(x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + + x%state = is_host + end subroutine d_oacc_set_host + + subroutine d_oacc_set_dev(x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + + x%state = is_dev + end subroutine d_oacc_set_dev + + subroutine d_oacc_set_sync(x) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + + x%state = is_sync + end subroutine d_oacc_set_sync + + function d_oacc_is_dev(x) result(res) + implicit none + class(psb_d_vect_oacc), intent(in) :: x + logical :: res + + res = (x%state == is_dev) + end function d_oacc_is_dev + + function d_oacc_is_host(x) result(res) + implicit none + class(psb_d_vect_oacc), intent(in) :: x + logical :: res + + res = (x%state == is_host) + end function d_oacc_is_host + + function d_oacc_is_sync(x) result(res) + implicit none + class(psb_d_vect_oacc), intent(in) :: x + logical :: res + + res = (x%state == is_sync) + end function d_oacc_is_sync + + subroutine d_oacc_vect_all(n, x, info) + use psi_serial_mod + use psb_realloc_mod + implicit none + integer(psb_ipk_), intent(in) :: n + class(psb_d_vect_oacc), intent(out) :: x + integer(psb_ipk_), intent(out) :: info + + call psb_realloc(n, x%v, info) + if (info == 0) then + call x%set_host() + !$acc enter data create(x%v) + call x%sync_space() + end if + if (info /= 0) then + info = psb_err_alloc_request_ + call psb_errpush(info, 'd_oacc_all', & + i_err=(/n, n, n, n, n/)) + end if + end subroutine d_oacc_vect_all + + + subroutine d_oacc_vect_free(x, info) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(out) :: info + info = 0 + if (allocated(x%v)) then + !$acc exit data delete(x%v) finalize + deallocate(x%v, stat=info) + end if + + end subroutine d_oacc_vect_free + + function oacc_get_size(x) result(res) + implicit none + class(psb_d_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: res + + if (x%is_dev()) call x%sync() + res = size(x%v) + end function oacc_get_size + + + subroutine initialize(N) + integer(psb_ipk_) :: N + integer(psb_ipk_) :: i + allocate(v1(N),v2(N),p(N)) + !$acc enter data create(v1,v2,p) + !$acc parallel + !$acc loop + do i=1,n + v1(i) = i + v2(i) = n+i + end do + !$acc end parallel + end subroutine initialize + subroutine finalize_dev() + !$acc exit data delete(v1,v2,p) + end subroutine finalize_dev + subroutine finalize_host() + deallocate(v1,v2,p) + end subroutine finalize_host + subroutine to_dev() + !$acc update device(v1,v2) + end subroutine to_dev + subroutine to_host() + !$acc update self(v1,v2) + end subroutine to_host + function d_dot(N) result(res) + real(kind(1.d0)) :: res + integer(psb_ipk_) :: i,N + real(kind(1.d0)) :: t1,t2,t3 + res = 0.0d0 + !$acc parallel + !$acc loop reduction(+:res) + do i=1,N + res = res + v1(i) * v2(i) + end do + !$acc end parallel + + end function d_dot + function h_dot(N) result(res) + integer(psb_ipk_) :: i,N + real(kind(1.d0)) :: t1,t2,t3,res + res = 0.0d0 + do i=1,N + res = res + v1(i) * v2(i) + end do + end function h_dot + +end module psb_d_oacc_vect_mod \ No newline at end of file diff --git a/openacc/psb_i_oacc_vect_mod.F90 b/openacc/psb_i_oacc_vect_mod.F90 new file mode 100644 index 00000000..70fc325e --- /dev/null +++ b/openacc/psb_i_oacc_vect_mod.F90 @@ -0,0 +1,455 @@ +module psb_i_oacc_vect_mod + use iso_c_binding + use psb_const_mod + use psb_error_mod + use psb_i_vect_mod + + integer(psb_ipk_), parameter, private :: is_host = -1 + integer(psb_ipk_), parameter, private :: is_sync = 0 + integer(psb_ipk_), parameter, private :: is_dev = 1 + + type, extends(psb_i_base_vect_type) :: psb_i_vect_oacc + integer :: state = is_host + contains + procedure, pass(x) :: get_nrows => i_oacc_get_nrows + procedure, nopass :: get_fmt => i_oacc_get_fmt + + procedure, pass(x) :: all => i_oacc_all + procedure, pass(x) :: zero => i_oacc_zero + procedure, pass(x) :: asb_m => i_oacc_asb_m + procedure, pass(x) :: sync => i_oacc_sync + procedure, pass(x) :: sync_space => i_oacc_sync_space + procedure, pass(x) :: bld_x => i_oacc_bld_x + procedure, pass(x) :: bld_mn => i_oacc_bld_mn + procedure, pass(x) :: free => i_oacc_free + procedure, pass(x) :: ins_a => i_oacc_ins_a + procedure, pass(x) :: ins_v => i_oacc_ins_v + procedure, pass(x) :: is_host => i_oacc_is_host + procedure, pass(x) :: is_dev => i_oacc_is_dev + procedure, pass(x) :: is_sync => i_oacc_is_sync + procedure, pass(x) :: set_host => i_oacc_set_host + procedure, pass(x) :: set_dev => i_oacc_set_dev + procedure, pass(x) :: set_sync => i_oacc_set_sync + procedure, pass(x) :: set_scal => i_oacc_set_scal + procedure, pass(x) :: gthzv_x => i_oacc_gthzv_x + procedure, pass(y) :: sctb => i_oacc_sctb + procedure, pass(y) :: sctb_x => i_oacc_sctb_x + procedure, pass(x) :: gthzbuf => i_oacc_gthzbuf + procedure, pass(y) :: sctb_buf => i_oacc_sctb_buf + + final :: i_oacc_vect_finalize + end type psb_i_vect_oacc + + public :: psb_i_vect_oacc_ + private :: constructor + interface psb_i_vect_oacc_ + module procedure constructor + end interface psb_i_vect_oacc_ + +contains + + function constructor(x) result(this) + integer(psb_ipk_) :: x(:) + type(psb_i_vect_oacc) :: this + integer(psb_ipk_) :: info + + this%v = x + call this%asb(size(x), info) + end function constructor + + + subroutine i_oacc_gthzv_x(i, n, idx, x, y) + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type) :: idx + integer(psb_ipk_) :: y(:) + class(psb_i_vect_oacc) :: x + integer :: info + + !$acc parallel loop + do i = 1, n + y(i) = x%v(idx%v(i)) + end do + end subroutine i_oacc_gthzv_x + + subroutine i_oacc_gthzbuf(i, n, idx, x) + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type) :: idx + class(psb_i_vect_oacc) :: x + integer :: info + + if (.not.allocated(x%combuf)) then + call psb_errpush(psb_err_alloc_dealloc_, 'gthzbuf') + return + end if + + !$acc parallel loop + do i = 1, n + x%combuf(i) = x%v(idx%v(i)) + end do + end subroutine i_oacc_gthzbuf + + subroutine i_oacc_sctb(n, idx, x, beta, y) + implicit none + integer(psb_ipk_) :: n, idx(:) + integer(psb_ipk_) :: beta, x(:) + class(psb_i_vect_oacc) :: y + integer(psb_ipk_) :: info + integer :: i + + if (n == 0) return + + !$acc parallel loop + do i = 1, n + y%v(idx(i)) = beta * y%v(idx(i)) + x(i) + end do + end subroutine i_oacc_sctb + + subroutine i_oacc_sctb_x(i, n, idx, x, beta, y) + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type) :: idx + integer(psb_ipk_) :: beta, x(:) + class(psb_i_vect_oacc) :: y + integer :: info + + select type(ii => idx) + class is (psb_i_vect_oacc) + if (ii%is_host()) call ii%sync_space(info) + if (y%is_host()) call y%sync_space(info) + + !$acc parallel loop + do i = 1, n + y%v(ii%v(i)) = beta * y%v(ii%v(i)) + x(i) + end do + + class default + !$acc parallel loop + do i = 1, n + y%v(idx%v(i)) = beta * y%v(idx%v(i)) + x(i) + end do + end select + end subroutine i_oacc_sctb_x + + subroutine i_oacc_sctb_buf(i, n, idx, beta, y) + implicit none + integer(psb_ipk_) :: i, n + class(psb_i_base_vect_type) :: idx + integer(psb_ipk_) :: beta + class(psb_i_vect_oacc) :: y + integer(psb_ipk_) :: info + + if (.not.allocated(y%v)) then + call psb_errpush(psb_err_alloc_dealloc_, 'sctb_buf') + return + end if + + !$acc parallel loop + do i = 1, n + y%v(idx%v(i)) = beta * y%v(idx%v(i)) + y%v(i) + end do + end subroutine i_oacc_sctb_buf + + subroutine i_oacc_set_host(x) + class(psb_i_vect_oacc), intent(inout) :: x + x%state = is_host + end subroutine i_oacc_set_host + + subroutine i_oacc_set_sync(x) + class(psb_i_vect_oacc), intent(inout) :: x + x%state = is_sync + end subroutine i_oacc_set_sync + + subroutine i_oacc_set_dev(x) + class(psb_i_vect_oacc), intent(inout) :: x + x%state = is_dev + end subroutine i_oacc_set_dev + + subroutine i_oacc_set_scal(x, val, first, last) + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: val + integer(psb_ipk_), optional :: first, last + + integer(psb_ipk_) :: first_, last_ + + first_ = 1 + last_ = size(x%v) + if (present(first)) first_ = max(1, first) + if (present(last)) last_ = min(size(x%v), last) + + !$acc parallel loop + do i = first_, last_ + x%v(i) = val + end do + call x%set_dev() + end subroutine i_oacc_set_scal + + function i_oacc_is_host(x) result(res) + implicit none + class(psb_i_vect_oacc), intent(in) :: x + logical :: res + + res = (x%state == is_host) + end function i_oacc_is_host + + function i_oacc_is_dev(x) result(res) + implicit none + class(psb_i_vect_oacc), intent(in) :: x + logical :: res + + res = (x%state == is_dev) + end function i_oacc_is_dev + + function i_oacc_is_sync(x) result(res) + implicit none + class(psb_i_vect_oacc), intent(in) :: x + logical :: res + + res = (x%state == is_sync) + end function i_oacc_is_sync + + subroutine i_oacc_free(x, info) + use psb_error_mod + implicit none + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(out) :: info + + info = 0 + if (allocated(x%v)) deallocate(x%v, stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_ + call psb_errpush(info, 'i_oacc_free') + end if + call x%set_sync() + end subroutine i_oacc_free + + subroutine i_oacc_ins_a(n, irl, val, dupl, x, info) + implicit none + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n, dupl + integer(psb_ipk_), intent(in) :: irl(:) + integer(psb_ipk_), intent(in) :: val(:) + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: i + + info = 0 + if (x%is_dev()) call x%sync() + call x%psb_i_base_vect_type%ins(n, irl, val, dupl, info) + call x%set_host() + end subroutine i_oacc_ins_a + + subroutine i_oacc_ins_v(n, irl, val, dupl, x, info) + implicit none + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(in) :: n, dupl + class(psb_i_base_vect_type), intent(inout) :: irl + class(psb_i_base_vect_type), intent(inout) :: val + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: i, isz + logical :: done_oacc + + info = 0 + if (psb_errstatus_fatal()) return + + done_oacc = .false. + select type(virl => irl) + class is (psb_i_vect_oacc) + select type(vval => val) + class is (psb_i_vect_oacc) + if (vval%is_host()) call vval%sync() + if (virl%is_host()) call virl%sync() + if (x%is_host()) call x%sync() + ! Add the OpenACC kernel call here if needed + call x%set_dev() + done_oacc = .true. + end select + end select + + if (.not.done_oacc) then + if (irl%is_dev()) call irl%sync() + if (val%is_dev()) call val%sync() + call x%ins(n, irl%v, val%v, dupl, info) + end if + + if (info /= 0) then + call psb_errpush(info,'i_oacc_ins_v') + return + end if + end subroutine i_oacc_ins_v + + subroutine i_oacc_bld_x(x, this) + use psb_error_mod + implicit none + integer(psb_ipk_), intent(in) :: this(:) + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: info + + call psb_realloc(size(this), x%v, info) + if (info /= 0) then + info = psb_err_alloc_request_ + call psb_errpush(info, 'i_oacc_bld_x', i_err = (/size(this), izero, izero, izero, izero/)) + end if + x%v(:) = this(:) + call x%set_host() + call x%sync() + end subroutine i_oacc_bld_x + + subroutine i_oacc_bld_mn(x, n) + use psb_error_mod + implicit none + integer(psb_mpk_), intent(in) :: n + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: info + + call x%all(n, info) + if (info /= 0) then + call psb_errpush(info, 'i_oacc_bld_mn', i_err = (/n, n, n, n, n/)) + end if + end subroutine i_oacc_bld_mn + + subroutine i_oacc_sync(x) + use psb_error_mod + implicit none + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: n, info + + info = 0 + if (x%is_host()) then + n = size(x%v) + if (.not.allocated(x%v)) then + write(*, *) 'Incoherent situation : x%v not allocated' + call psb_realloc(n, x%v, info) + end if + if ((n > size(x%v)) .or. (n > x%get_nrows())) then + write(*, *) 'Incoherent situation : sizes', n, size(x%v), x%get_nrows() + call psb_realloc(n, x%v, info) + end if + !$acc update device(x%v) + else if (x%is_dev()) then + n = size(x%v) + if (.not.allocated(x%v)) then + write(*, *) 'Incoherent situation : x%v not allocated' + call psb_realloc(n, x%v, info) + end if + !$acc update self(x%v) + end if + if (info == 0) call x%set_sync() + if (info /= 0) then + info = psb_err_internal_error_ + call psb_errpush(info, 'i_oacc_sync') + end if + end subroutine i_oacc_sync + + subroutine i_oacc_sync_space(x, info) + use psb_error_mod + implicit none + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: nh, nd + + info = 0 + if (x%is_dev()) then + nh = size(x%v) + nd = nh + if (nh < nd) then + call psb_realloc(nd, x%v, info) + end if + else + nh = size(x%v) + nd = nh + if (nh < nd) then + call psb_realloc(nd, x%v, info) + end if + end if + end subroutine i_oacc_sync_space + + function i_oacc_get_nrows(x) result(res) + implicit none + class(psb_i_vect_oacc), intent(in) :: x + integer(psb_ipk_) :: res + + res = 0 + if (allocated(x%v)) res = size(x%v) + end function i_oacc_get_nrows + + function i_oacc_get_fmt() result(res) + implicit none + character(len=5) :: res + res = 'iOACC' + end function i_oacc_get_fmt + + subroutine i_oacc_all(n, x, info) + use psb_error_mod + implicit none + class(psb_i_vect_oacc), intent(out) :: x + integer(psb_ipk_), intent(in) :: n + integer(psb_ipk_), intent(out) :: info + + call psb_realloc(n, x%v, info) + if (info == 0) call x%set_host() + if (info == 0) call x%sync_space(info) + if (info /= 0) then + info = psb_err_alloc_request_ + call psb_errpush(info, 'i_oacc_all', i_err=(/n, n, n, n, n/)) + end if + end subroutine i_oacc_all + + subroutine i_oacc_zero(x) + use psb_error_mod + implicit none + class(psb_i_vect_oacc), intent(inout) :: x + ! Ensure zeroing on the GPU side + call x%set_dev() + x%v = 0 + !$acc update device(x%v) + end subroutine i_oacc_zero + + subroutine i_oacc_asb_m(n, x, info) + use psb_error_mod + use psb_realloc_mod + implicit none + integer(psb_ipk_), intent(in) :: n + class(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: nh, nd + + info = 0 + if (x%is_dev()) then + nd = size(x%v) + if (nd < n) then + call x%sync() + call x%psb_i_base_vect_type%asb(n, info) + if (info == psb_success_) call x%sync_space(info) + call x%set_host() + end if + else + nh = size(x%v) + if (nh < n) then + call x%psb_i_base_vect_type%asb(n, info) + if (info == psb_success_) call x%sync_space(info) + call x%set_host() + end if + end if + end subroutine i_oacc_asb_m + + subroutine i_oacc_vect_finalize(x) + use psi_serial_mod + use psb_realloc_mod + implicit none + type(psb_i_vect_oacc), intent(inout) :: x + integer(psb_ipk_) :: info + + info = 0 + call x%free(info) + end subroutine i_oacc_vect_finalize + +end module psb_i_oacc_vect_mod + + + + + + \ No newline at end of file diff --git a/openacc/psb_oacc_mod.F90 b/openacc/psb_oacc_mod.F90 new file mode 100644 index 00000000..ce5e85f9 --- /dev/null +++ b/openacc/psb_oacc_mod.F90 @@ -0,0 +1,7 @@ +module psb_oacc_mod + use psb_const_mod + + use psb_d_oacc_vect_mod + use psb_d_oacc_csr_mat_mod + +end module psb_oacc_mod \ No newline at end of file diff --git a/test/openacc/Makefile b/test/openacc/Makefile new file mode 100644 index 00000000..6df14d42 --- /dev/null +++ b/test/openacc/Makefile @@ -0,0 +1,53 @@ +TOPDIR=../.. +include $(TOPDIR)/Make.inc + +# Directories +LIBDIR=$(TOPDIR)/lib/ +PSBLIBDIR=$(TOPDIR)/lib/ +PSBINCDIR=$(TOPDIR)/include +PSBMODDIR=$(TOPDIR)/modules +INCDIR=$(TOPDIR)/include +MODDIR=$(TOPDIR)/modules +EXEDIR=./runs + +# Libraries +PSBLAS_LIB= -L$(LIBDIR) -L$(PSBLIBDIR) -lpsb_util -lpsb_ext -lpsb_base -lpsb_openacc -lopenblas -lmetis +LDLIBS=$(PSBGPULDLIBS) + +# Includes +FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(FMFLAG). $(FMFLAG)$(PSBMODDIR) $(FMFLAG)$(PSBINCDIR) $(LIBRSB_DEFINES) + +# Compiler flags +FFLAGS=-O0 -march=native -fopenacc -foffload=nvptx-none="-march=sm_70" +CFLAGS=-O0 -march=native + +# Source files +SRCS=test.F90 vectoacc.F90 +CSRC=timers.c + +# Object files +OBJS=$(SRCS:.F90=.o) $(CSRC:.c=.o) + +# Default rule +all: dir + +dir: + @if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi + +# Pattern rule for creating executables +%: %.o timers.o + $(FC) $(FFLAGS) $^ -o $@ $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS) + /bin/mv $@ $(EXEDIR) + +# Compilation rules +%.o: %.F90 + $(FC) $(FFLAGS) $(FINCLUDES) -c $< -o $@ + +%.o: %.c + $(CC) $(CFLAGS) $(FINCLUDES) -c $< -o $@ + +clean: + /bin/rm -fr *.o *.mod $(EXEDIR)/* + +# Phony targets +.PHONY: all dir clean diff --git a/test/openacc/test.F90 b/test/openacc/test.F90 new file mode 100644 index 00000000..0d0b756f --- /dev/null +++ b/test/openacc/test.F90 @@ -0,0 +1,617 @@ +module psb_d_pde3d_mod + + + use psb_base_mod, only : psb_dpk_, psb_ipk_, psb_lpk_, psb_desc_type,& + & psb_dspmat_type, psb_d_vect_type, dzero,& + & psb_d_base_sparse_mat, psb_d_base_vect_type, & + & psb_i_base_vect_type, psb_l_base_vect_type + + interface + function d_func_3d(x,y,z) result(val) + import :: psb_dpk_ + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + end function d_func_3d + end interface + + interface psb_gen_pde3d + module procedure psb_d_gen_pde3d + end interface psb_gen_pde3d + + contains + + function d_null_func_3d(x,y,z) result(val) + + real(psb_dpk_), intent(in) :: x,y,z + real(psb_dpk_) :: val + + val = dzero + + end function d_null_func_3d + ! + ! functions parametrizing the differential equation + ! + + ! + ! Note: b1, b2 and b3 are the coefficients of the first + ! derivative of the unknown function. The default + ! we apply here is to have them zero, so that the resulting + ! matrix is symmetric/hermitian and suitable for + ! testing with CG and FCG. + ! When testing methods for non-hermitian matrices you can + ! change the B1/B2/B3 functions to e.g. done/sqrt((3*done)) + ! + function b1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b1 + real(psb_dpk_), intent(in) :: x,y,z + b1=done/sqrt((3*done)) + end function b1 + function b2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b2 + real(psb_dpk_), intent(in) :: x,y,z + b2=done/sqrt((3*done)) + end function b2 + function b3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: b3 + real(psb_dpk_), intent(in) :: x,y,z + b3=done/sqrt((3*done)) + end function b3 + function c(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: c + real(psb_dpk_), intent(in) :: x,y,z + c=dzero + end function c + function a1(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a1 + real(psb_dpk_), intent(in) :: x,y,z + a1=done/80 + end function a1 + function a2(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a2 + real(psb_dpk_), intent(in) :: x,y,z + a2=done/80 + end function a2 + function a3(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: a3 + real(psb_dpk_), intent(in) :: x,y,z + a3=done/80 + end function a3 + function g(x,y,z) + use psb_base_mod, only : psb_dpk_, done, dzero + implicit none + real(psb_dpk_) :: g + real(psb_dpk_), intent(in) :: x,y,z + g = dzero + if (x == done) then + g = done + else if (x == dzero) then + g = exp(y**2-z**2) + end if + end function g + + + ! + ! subroutine to allocate and fill in the coefficient matrix and + ! the rhs. + ! + subroutine psb_d_gen_pde3d(ctxt,idim,a,bv,xv,desc_a,afmt,info,& + & f,amold,vmold,imold,partition,nrl,iv,tnd) + use psb_base_mod + use psb_util_mod + ! + ! Discretizes the partial differential equation + ! + ! a1 dd(u) a2 dd(u) a3 dd(u) b1 d(u) b2 d(u) b3 d(u) + ! - ------ - ------ - ------ + ----- + ------ + ------ + c u = f + ! dxdx dydy dzdz dx dy dz + ! + ! with Dirichlet boundary conditions + ! u = g + ! + ! on the unit cube 0<=x,y,z<=1. + ! + ! + ! Note that if b1=b2=b3=c=0., the PDE is the Laplace equation. + ! + implicit none + integer(psb_ipk_) :: idim + type(psb_dspmat_type) :: a + type(psb_d_vect_type) :: xv,bv + type(psb_desc_type) :: desc_a + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: info + character(len=*) :: afmt + procedure(d_func_3d), optional :: f + class(psb_d_base_sparse_mat), optional :: amold + class(psb_d_base_vect_type), optional :: vmold + class(psb_i_base_vect_type), optional :: imold + integer(psb_ipk_), optional :: partition, nrl,iv(:) + logical, optional :: tnd + ! Local variables. + + integer(psb_ipk_), parameter :: nb=20 + type(psb_d_csc_sparse_mat) :: acsc + type(psb_d_coo_sparse_mat) :: acoo + type(psb_d_csr_sparse_mat) :: acsr + real(psb_dpk_) :: zt(nb),x,y,z + integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ + integer(psb_lpk_) :: m,n,glob_row,nt + integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner + ! For 3D partition + ! Note: integer control variables going directly into an MPI call + ! must be 4 bytes, i.e. psb_mpk_ + integer(psb_mpk_) :: npdims(3), npp, minfo + integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz + integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:) + ! Process grid + integer(psb_ipk_) :: np, iam + integer(psb_ipk_) :: icoeff + integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) + real(psb_dpk_), allocatable :: val(:) + ! deltah dimension of each grid cell + ! deltat discretization time + real(psb_dpk_) :: deltah, sqdeltah, deltah2 + real(psb_dpk_), parameter :: rhs=dzero,one=done,zero=dzero + real(psb_dpk_) :: t0, t1, t2, t3, tasb, talc, ttot, tgen, tcdasb + integer(psb_ipk_) :: err_act + procedure(d_func_3d), pointer :: f_ + logical :: tnd_ + character(len=20) :: name, ch_err,tmpfmt + + info = psb_success_ + name = 'create_matrix' + call psb_erractionsave(err_act) + + call psb_info(ctxt, iam, np) + + + if (present(f)) then + f_ => f + else + f_ => d_null_func_3d + end if + + deltah = done/(idim+2) + sqdeltah = deltah*deltah + deltah2 = (2*done)* deltah + + if (present(partition)) then + if ((1<= partition).and.(partition <= 3)) then + partition_ = partition + else + write(*,*) 'Invalid partition choice ',partition,' defaulting to 3' + partition_ = 3 + end if + else + partition_ = 3 + end if + + ! initialize array descriptor and sparse matrix storage. provide an + ! estimate of the number of non zeroes + + m = (1_psb_lpk_*idim)*idim*idim + n = m + nnz = ((n*7)/(np)) + if(iam == psb_root_) write(psb_out_unit,'("Generating Matrix (size=",i0,")...")')n + t0 = psb_wtime() + select case(partition_) + case(1) + ! A BLOCK partition + if (present(nrl)) then + nr = nrl + else + ! + ! Using a simple BLOCK distribution. + ! + nt = (m+np-1)/np + nr = max(0,min(nt,m-(iam*nt))) + end if + + nt = nr + call psb_sum(ctxt,nt) + if (nt /= m) then + write(psb_err_unit,*) iam, 'Initialization error ',nr,nt,m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! First example of use of CDALL: specify for each process a number of + ! contiguous rows + ! + call psb_cdall(ctxt,desc_a,info,nl=nr) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(2) + ! A partition defined by the user through IV + + if (present(iv)) then + if (size(iv) /= m) then + write(psb_err_unit,*) iam, 'Initialization error: wrong IV size',size(iv),m + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + else + write(psb_err_unit,*) iam, 'Initialization error: IV not present' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end if + + ! + ! Second example of use of CDALL: specify for each row the + ! process that owns it + ! + call psb_cdall(ctxt,desc_a,info,vg=iv) + myidx = desc_a%get_global_indices() + nlr = size(myidx) + + case(3) + ! A 3-dimensional partition + + ! A nifty MPI function will split the process list + npdims = 0 + call mpi_dims_create(np,3,npdims,info) + npx = npdims(1) + npy = npdims(2) + npz = npdims(3) + + allocate(bndx(0:npx),bndy(0:npy),bndz(0:npz)) + ! We can reuse idx2ijk for process indices as well. + call idx2ijk(iamx,iamy,iamz,iam,npx,npy,npz,base=0) + ! Now let's split the 3D cube in hexahedra + call dist1Didx(bndx,idim,npx) + mynx = bndx(iamx+1)-bndx(iamx) + call dist1Didx(bndy,idim,npy) + myny = bndy(iamy+1)-bndy(iamy) + call dist1Didx(bndz,idim,npz) + mynz = bndz(iamz+1)-bndz(iamz) + + ! How many indices do I own? + nlr = mynx*myny*mynz + allocate(myidx(nlr)) + ! Now, let's generate the list of indices I own + nr = 0 + do i=bndx(iamx),bndx(iamx+1)-1 + do j=bndy(iamy),bndy(iamy+1)-1 + do k=bndz(iamz),bndz(iamz+1)-1 + nr = nr + 1 + call ijk2idx(myidx(nr),i,j,k,idim,idim,idim) + end do + end do + end do + if (nr /= nlr) then + write(psb_err_unit,*) iam,iamx,iamy,iamz, 'Initialization error: NR vs NLR ',& + & nr,nlr,mynx,myny,mynz + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + end if + + ! + ! Third example of use of CDALL: specify for each process + ! the set of global indices it owns. + ! + call psb_cdall(ctxt,desc_a,info,vl=myidx) + + case default + write(psb_err_unit,*) iam, 'Initialization error: should not get here' + info = -1 + call psb_barrier(ctxt) + call psb_abort(ctxt) + return + end select + + + if (info == psb_success_) call psb_spall(a,desc_a,info,nnz=nnz,& + & dupl=psb_dupl_err_) + ! define rhs from boundary conditions; also build initial guess + if (info == psb_success_) call psb_geall(xv,desc_a,info) + if (info == psb_success_) call psb_geall(bv,desc_a,info) + + call psb_barrier(ctxt) + talc = psb_wtime()-t0 + + if (info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='allocation rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + ! we build an auxiliary matrix consisting of one row at a + ! time; just a small matrix. might be extended to generate + ! a bunch of rows per call. + ! + allocate(val(20*nb),irow(20*nb),& + &icol(20*nb),stat=info) + if (info /= psb_success_ ) then + info=psb_err_alloc_dealloc_ + call psb_errpush(info,name) + goto 9999 + endif + + + ! loop over rows belonging to current process in a block + ! distribution. + + call psb_barrier(ctxt) + t1 = psb_wtime() + do ii=1, nlr,nb + ib = min(nb,nlr-ii+1) + icoeff = 1 + do k=1,ib + i=ii+k-1 + ! local matrix pointer + glob_row=myidx(i) + ! compute gridpoint coordinates + call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) + ! x, y, z coordinates + x = (ix-1)*deltah + y = (iy-1)*deltah + z = (iz-1)*deltah + zt(k) = f_(x,y,z) + ! internal point: build discretization + ! + ! term depending on (x-1,y,z) + ! + val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 + if (ix == 1) then + zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y-1,z) + val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 + if (iy == 1) then + zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y,z-1) + val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 + if (iz == 1) then + zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + ! term depending on (x,y,z) + val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & + & + c(x,y,z) + call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + ! term depending on (x,y,z+1) + val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 + if (iz == idim) then + zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x,y+1,z) + val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 + if (iy == idim) then + zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + ! term depending on (x+1,y,z) + val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2 + if (ix==idim) then + zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) + else + call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) + irow(icoeff) = glob_row + icoeff = icoeff+1 + endif + + end do + call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) + if(info /= psb_success_) exit + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) + if(info /= psb_success_) exit + zt(:)=dzero + call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info) + if(info /= psb_success_) exit + end do + + tgen = psb_wtime()-t1 + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='insert rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + + deallocate(val,irow,icol) + + call psb_barrier(ctxt) + t1 = psb_wtime() + call psb_cdasb(desc_a,info,mold=imold) + tcdasb = psb_wtime()-t1 + call psb_barrier(ctxt) + t1 = psb_wtime() + if (info == psb_success_) then + if (present(amold)) then + call psb_spasb(a,desc_a,info,mold=amold,bld_and=tnd) + else + call psb_spasb(a,desc_a,info,afmt=afmt,bld_and=tnd) + end if + end if + call psb_barrier(ctxt) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + if (info == psb_success_) call psb_geasb(xv,desc_a,info,mold=vmold) + if (info == psb_success_) call psb_geasb(bv,desc_a,info,mold=vmold) + if(info /= psb_success_) then + info=psb_err_from_subroutine_ + ch_err='asb rout.' + call psb_errpush(info,name,a_err=ch_err) + goto 9999 + end if + tasb = psb_wtime()-t1 + call psb_barrier(ctxt) + ttot = psb_wtime() - t0 + + call psb_amx(ctxt,talc) + call psb_amx(ctxt,tgen) + call psb_amx(ctxt,tasb) + call psb_amx(ctxt,ttot) + if(iam == psb_root_) then + tmpfmt = a%get_fmt() + write(psb_out_unit,'("The matrix has been generated and assembled in ",a3," format.")')& + & tmpfmt + write(psb_out_unit,'("-allocation time : ",es12.5)') talc + write(psb_out_unit,'("-coeff. gen. time : ",es12.5)') tgen + write(psb_out_unit,'("-desc asbly time : ",es12.5)') tcdasb + write(psb_out_unit,'("- mat asbly time : ",es12.5)') tasb + write(psb_out_unit,'("-total time : ",es12.5)') ttot + + end if + call psb_erractionrestore(err_act) + return + + 9999 call psb_error_handler(ctxt,err_act) + + return + end subroutine psb_d_gen_pde3d + + +end module psb_d_pde3d_mod + + + +program test + use psb_base_mod + use psb_ext_mod + use psb_oacc_mod + use psb_d_pde3d_mod + + implicit none + integer(psb_ipk_) :: n, i, info, m, nrm, nz + integer(psb_ipk_), parameter :: ntests=80, ngpu=20 + real(psb_dpk_) :: dot_dev, dot_host + type(psb_d_vect_oacc) :: tx, ty + type(psb_d_oacc_csr_sparse_mat) :: aacsr + real(psb_dpk_) :: t0, t1, t2, t3, csflp, elflp + double precision, external :: etime + + type(psb_dspmat_type) :: a + type(psb_desc_type) :: desc_a + type(psb_d_vect_type) :: xxv, bv + type(psb_d_csr_sparse_mat) :: acsr + character(len=5) :: afmt='csr' + real(psb_dpk_), allocatable :: vv(:), ydev(:), yhost(:) + type(psb_ctxt_type) :: ctxt + integer(psb_ipk_) :: iam, np, nth, idim + integer(psb_epk_) :: neq + + call psb_init(ctxt) + call psb_info(ctxt, iam, np) + + write(*,*) 'Enter size :' + read(*,*) idim + idim = max(1, idim) + + n = idim**3 + call psb_gen_pde3d(ctxt, idim, a, bv, xxv, desc_a, afmt, info) + call a%cp_to(acsr) + m = acsr%get_nrows() + n = acsr%get_ncols() + nz = acsr%get_nzeros() + call aacsr%all(m, n, nz, info) + aacsr%val = (acsr%val) + aacsr%ja = (acsr%ja) + aacsr%irp = (acsr%irp) + call aacsr%set_host() + call aacsr%sync() + + call initialize(n) + + call to_host() + t2 = etime() + do i = 1, ntests + dot_host = h_dot(n) + end do + t3 = etime() + + call tx%all(n, info) + call ty%all(n, info) + vv = bv%get_vect() + call bv%set_vect(v1) + call tx%set_vect(v1) + call ty%set_vect(v2) + t0 = etime() + do i = 1, ntests * ngpu + dot_dev = tx%dot_v(n, ty) + end do + !$acc wait + t1 = etime() + write(*,*) ' Dot Results : dev:', dot_dev, ' host:', dot_host + write(*,*) ' Timing : dev:', t1 - t0, (t1 - t0) / (ntests * ngpu), & + ' host:', t3 - t2, (t3 - t2) / ntests + + call a%mv_from(acsr) + t2 = etime() + do i = 1, ntests + call a%spmm(done, bv, dzero, xxv, info) + end do + t3 = etime() + yhost = xxv%get_vect() + t0 = etime() + do i = 1, ntests * ngpu + call aacsr%vect_mv(done, tx, dzero, ty, info) + end do + !$acc wait + t1 = etime() + ydev = ty%get_vect() + write(*,*) 'Correctness check: ', maxval(abs(ydev(:) - yhost(:))) + write(*,*) ' CSR PROD ' + write(*, '(2(a,f12.3,2x))') ' Timing (ms): ' + csflp = 2.d0 * nz / ((t1 - t0) / (ntests * ngpu)) + write(*, '(2(a,f12.3,2x))') ' dev:', 1e3 * (t1 - t0) / (ntests * ngpu), ' :', csflp / 1.d6 + csflp = 2.d0 * nz / ((t3 - t2) / (ntests)) + write(*, '(2(a,f12.3,2x))') ' host:', 1e3 * (t3 - t2) / ntests, ' :', csflp / 1.d6 + write(*,*) 'Done' + + call tx%free(info) + call ty%free(info) + call finalize_dev() + call finalize_host() + call psb_exit(ctxt) +end program test diff --git a/test/openacc/timers.c b/test/openacc/timers.c new file mode 100644 index 00000000..12fa4f56 --- /dev/null +++ b/test/openacc/timers.c @@ -0,0 +1,97 @@ +#include +#include +#include + +double wtime() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec)*1.0e3 + ((double)tt.tv_usec)*1.0e-3; + return(temp); +} + +double timef_() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec)*1.0e3 + ((double)tt.tv_usec)*1.0e-3; + return(temp); +} + +double timef() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec)*1.0e3 + ((double)tt.tv_usec)*1.0e-3; + return(temp); +} + +double etime() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6; + return(temp); +} + +double etime_() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6; + return(temp); +} + +double etimef() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6; + return(temp); +} + +double etimef_() +{ + struct timeval tt; + struct timezone tz; + double temp; + if (gettimeofday(&tt,&tz) != 0) { + fprintf(stderr,"Fatal error for gettimeofday ??? \n"); + exit(-1); + } + temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6; + return(temp); +} + + + diff --git a/test/openacc/vectoacc.F90 b/test/openacc/vectoacc.F90 new file mode 100644 index 00000000..b50fa317 --- /dev/null +++ b/test/openacc/vectoacc.F90 @@ -0,0 +1,85 @@ +program vectoacc + use psb_base_mod + use psb_oacc_mod + implicit none + + type(psb_d_vect_oacc) :: v3, v4, v5 + integer(psb_ipk_) :: info, n, i + real(psb_dpk_) :: alpha, beta, result + double precision, external :: etime + + real(psb_dpk_) :: dot_host, dot_dev, t_host, t_dev + double precision :: time_start, time_end + integer(psb_ipk_), parameter :: ntests=80, ngpu=20 + + write(*, *) 'Test of the vector operations with OpenACC' + + write(*, *) 'Enter the size of the vectors' + read(*, *) n + alpha = 2.0 + beta = 0.5 + + call v3%all(n, info) + call v4%all(n, info) + call v5%all(n, info) + + do i = 1, n + v3%v(i) = real(i, psb_dpk_) + v4%v(i) = real(n - i, psb_dpk_) + end do + + call v3%set_dev() + call v4%set_dev() + + call v3%scal(alpha) + call v3%sync() + + do i = 1, n + if (v3%v(i) /= alpha * real(i, psb_dpk_)) then + write(*, *) 'Scal error : index', i + end if + end do + write(*, *) 'Scal test passed' + + result = v3%dot_v(n, v4) + call v3%sync() + call v4%sync() + if (result /= sum(v3%v * v4%v)) then + write(*, *) 'Dot_v error, expected result:', sum(v3%v * v4%v), 'instead of :', result + end if + write(*, *) 'Dot_v test passed' + + result = v3%nrm2(n) + call v3%sync() + if (result /= sqrt(sum(v3%v ** 2))) then + write(*, *) 'nrm2 error, expected result:', sqrt(sum(v3%v ** 2)), 'instead of :', result + end if + write(*, *) 'nrm2 test passed' + + call v3%set_host() + call v4%set_host() + + time_start = etime() + do i = 1, ntests + dot_host = sum(v3%v * v4%v) + end do + time_end = etime() + t_host = (time_end - time_start) / real(ntests) + write(*, *) 'Performance host: ', t_host, ' sec' + + call v3%set_dev() + call v4%set_dev() + time_start = etime() + do i = 1, ntests + dot_dev = v3%dot_v(n, v4) + end do + !$acc wait + time_end = etime() + t_dev = (time_end - time_start) / real(ntests) + write(*, *) 'Performance device: ', t_dev, ' sec' + + call v3%free(info) + call v4%free(info) + call v5%free(info) + +end program vectoacc