Cuda multivect methods implementation

psblas-bgmres
gabrielequatrana 10 months ago
parent c1e4f9c2b1
commit a624b7098b

@ -1357,17 +1357,18 @@ module psb_d_cuda_multivect_mod
procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows
procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols
procedure, nopass :: get_fmt => d_cuda_multi_get_fmt procedure, nopass :: get_fmt => d_cuda_multi_get_fmt
! TODO
!!$ procedure, pass(x) :: dot_v => d_cuda_multi_dot_v !!$ procedure, pass(x) :: dot_v => d_cuda_multi_dot_v
!!$ procedure, pass(x) :: dot_a => d_cuda_multi_dot_a !!$ procedure, pass(x) :: dot_a => d_cuda_multi_dot_a
!!$ procedure, pass(y) :: axpby_v => d_cuda_multi_axpby_v procedure, pass(y) :: axpby_v => d_cuda_multi_axpby_v
!!$ procedure, pass(y) :: axpby_a => d_cuda_multi_axpby_a procedure, pass(y) :: axpby_a => d_cuda_multi_axpby_a
!!$ procedure, pass(y) :: mlt_v => d_cuda_multi_mlt_v !!$ procedure, pass(y) :: mlt_v => d_cuda_multi_mlt_v
!!$ procedure, pass(y) :: mlt_a => d_cuda_multi_mlt_a !!$ procedure, pass(y) :: mlt_a => d_cuda_multi_mlt_a
!!$ procedure, pass(z) :: mlt_a_2 => d_cuda_multi_mlt_a_2 !!$ procedure, pass(z) :: mlt_a_2 => d_cuda_multi_mlt_a_2
!!$ procedure, pass(z) :: mlt_v_2 => d_cuda_multi_mlt_v_2 !!$ procedure, pass(z) :: mlt_v_2 => d_cuda_multi_mlt_v_2
!!$ procedure, pass(x) :: scal => d_cuda_multi_scal !!$ procedure, pass(x) :: scal => d_cuda_multi_scal
!!$ procedure, pass(x) :: nrm2 => d_cuda_multi_nrm2 procedure, pass(x) :: nrm2 => d_cuda_multi_nrm2
!!$ procedure, pass(x) :: amax => d_cuda_multi_amax procedure, pass(x) :: amax => d_cuda_multi_amax
!!$ procedure, pass(x) :: asum => d_cuda_multi_asum !!$ procedure, pass(x) :: asum => d_cuda_multi_asum
procedure, pass(x) :: all => d_cuda_multi_all procedure, pass(x) :: all => d_cuda_multi_all
procedure, pass(x) :: zero => d_cuda_multi_zero procedure, pass(x) :: zero => d_cuda_multi_zero
@ -1607,108 +1608,109 @@ contains
res = 'dGPU' res = 'dGPU'
end function d_cuda_multi_get_fmt end function d_cuda_multi_get_fmt
!!$ function d_cuda_multi_dot_v(n,x,y) result(res) function d_cuda_multi_dot_v(n,x,y) result(res)
!!$ implicit none implicit none
!!$ class(psb_d_multivect_cuda), intent(inout) :: x class(psb_d_multivect_cuda), intent(inout) :: x
!!$ class(psb_d_base_multivect_type), intent(inout) :: y class(psb_d_base_multivect_type), intent(inout) :: y
!!$ integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(in) :: n
!!$ real(psb_dpk_) :: res real(psb_dpk_), allocatable :: res(:,:)
!!$ real(psb_dpk_), external :: ddot real(psb_dpk_), external :: ddot
!!$ integer(psb_ipk_) :: info integer(psb_ipk_) :: info
!!$
!!$ res = dzero res = dzero
!!$ ! !
!!$ ! Note: this is the gpu implementation. ! Note: this is the gpu implementation.
!!$ ! When we get here, we are sure that X is of ! When we get here, we are sure that X is of
!!$ ! TYPE psb_d_vect ! TYPE psb_d_vect
!!$ ! !
!!$ select type(yy => y) ! TODO tra
!!$ type is (psb_d_base_multivect_type) select type(yy => y)
!!$ if (x%is_dev()) call x%sync() type is (psb_d_multivect_cuda)
!!$ res = ddot(n,x%v,1,yy%v,1) if (x%is_host()) call x%sync()
!!$ type is (psb_d_multivect_cuda) if (yy%is_host()) call yy%sync()
!!$ if (x%is_host()) call x%sync() info = dotMultiVecDevice(res,n,x%deviceVect,yy%deviceVect,x%get_ncols())
!!$ if (yy%is_host()) call yy%sync() if (info /= 0) then
!!$ info = dotMultiVecDevice(res,n,x%deviceVect,yy%deviceVect) info = psb_err_internal_error_
!!$ if (info /= 0) then call psb_errpush(info,'d_cuda_multi_dot_v')
!!$ info = psb_err_internal_error_ end if
!!$ call psb_errpush(info,'d_cuda_multi_dot_v')
!!$ end if ! TODO
!!$ class default
!!$ class default ! y%sync is done in dot_a
!!$ ! y%sync is done in dot_a call x%sync()
!!$ call x%sync() res = y%dot(n,x%v)
!!$ res = y%dot(n,x%v) end select
!!$ end select
!!$ end function d_cuda_multi_dot_v
!!$ end function d_cuda_multi_dot_v
!!$ ! TODO
!!$ function d_cuda_multi_dot_a(n,x,y) result(res) function d_cuda_multi_dot_a(n,x,y) result(res)
!!$ implicit none implicit none
!!$ class(psb_d_multivect_cuda), intent(inout) :: x class(psb_d_multivect_cuda), intent(inout) :: x
!!$ real(psb_dpk_), intent(in) :: y(:) real(psb_dpk_), intent(in) :: y(:)
!!$ integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(in) :: n
!!$ real(psb_dpk_) :: res real(psb_dpk_), allocatable :: res(:,:)
!!$ real(psb_dpk_), external :: ddot real(psb_dpk_), external :: ddot
!!$
!!$ if (x%is_dev()) call x%sync() if (x%is_dev()) call x%sync()
!!$ res = ddot(n,y,1,x%v,1) allocate(res(2,2))
!!$ res = ddot(n,y,1,x%v,1)
!!$ end function d_cuda_multi_dot_a
!!$ end function d_cuda_multi_dot_a
!!$ subroutine d_cuda_multi_axpby_v(m,alpha, x, beta, y, info)
!!$ use psi_serial_mod subroutine d_cuda_multi_axpby_v(m,alpha, x, beta, y, info, n)
!!$ implicit none use psi_serial_mod
!!$ integer(psb_ipk_), intent(in) :: m implicit none
!!$ class(psb_d_base_multivect_type), intent(inout) :: x integer(psb_ipk_), intent(in) :: m
!!$ class(psb_d_multivect_cuda), intent(inout) :: y class(psb_d_base_multivect_type), intent(inout) :: x
!!$ real(psb_dpk_), intent (in) :: alpha, beta class(psb_d_multivect_cuda), intent(inout) :: y
!!$ integer(psb_ipk_), intent(out) :: info real(psb_dpk_), intent (in) :: alpha, beta
!!$ integer(psb_ipk_) :: nx, ny integer(psb_ipk_), intent(out) :: info
!!$ integer(psb_ipk_), intent(in), optional :: n
!!$ info = psb_success_ integer(psb_ipk_) :: nc, nx, ny
!!$
!!$ select type(xx => x) info = psb_success_
!!$ type is (psb_d_base_multivect_type) select type(xx => x)
!!$ if ((beta /= dzero).and.(y%is_dev()))& type is (psb_d_multivect_cuda)
!!$ & call y%sync() if ((beta /= dzero).and.(y%is_host())) call y%sync()
!!$ call psb_geaxpby(m,alpha,xx%v,beta,y%v,info) if (xx%is_host()) call xx%sync()
!!$ call y%set_host() nx = getMultiVecDeviceSize(xx%deviceVect)
!!$ type is (psb_d_multivect_cuda) ny = getMultiVecDeviceSize(y%deviceVect)
!!$ ! Do something different here if ((nx<m).or.(ny<m)) then
!!$ if ((beta /= dzero).and.y%is_host())& info = psb_err_internal_error_
!!$ & call y%sync() else
!!$ if (xx%is_host()) call xx%sync() info = axpbyMultiVecDevice(m,alpha,xx%deviceVect,beta,y%deviceVect)
!!$ nx = getMultiVecDeviceSize(xx%deviceVect) end if
!!$ ny = getMultiVecDeviceSize(y%deviceVect) call y%set_dev()
!!$ if ((nx<m).or.(ny<m)) then class default
!!$ info = psb_err_internal_error_ ! Do it on the host side
!!$ info = psb_err_internal_error_ if ((alpha /= dzero).and.(x%is_dev())) call x%sync()
!!$ else call y%axpby(m,alpha,x%v,beta,info,n=n)
!!$ info = axpbyMultiVecDevice(m,alpha,xx%deviceVect,beta,y%deviceVect) end select
!!$ end if
!!$ call y%set_dev() end subroutine d_cuda_multi_axpby_v
!!$ class default
!!$ call x%sync() subroutine d_cuda_multi_axpby_a(m,alpha, x, beta, y, info, n)
!!$ call y%axpby(m,alpha,x%v,beta,info) use psi_serial_mod
!!$ end select implicit none
!!$ integer(psb_ipk_), intent(in) :: m
!!$ end subroutine d_cuda_multi_axpby_v real(psb_dpk_), intent(in) :: x(:,:)
!!$ class(psb_d_multivect_cuda), intent(inout) :: y
!!$ subroutine d_cuda_multi_axpby_a(m,alpha, x, beta, y, info) real(psb_dpk_), intent (in) :: alpha, beta
!!$ use psi_serial_mod integer(psb_ipk_), intent(out) :: info
!!$ implicit none integer(psb_ipk_), intent(in), optional :: n
!!$ integer(psb_ipk_), intent(in) :: m integer(psb_ipk_) :: nc
!!$ real(psb_dpk_), intent(in) :: x(:)
!!$ class(psb_d_multivect_cuda), intent(inout) :: y if (present(n)) then
!!$ real(psb_dpk_), intent (in) :: alpha, beta nc = n
!!$ integer(psb_ipk_), intent(out) :: info else
!!$ nc = min(size(x,2),size(y%v,2))
!!$ if (y%is_dev()) call y%sync() end if
!!$ call psb_geaxpby(m,alpha,x,beta,y%v,info) if ((beta /= dzero).and.(y%is_dev())) call y%sync()
!!$ call y%set_host() call psb_geaxpby(m,nc,alpha,x,beta,y%v,info)
!!$ end subroutine d_cuda_multi_axpby_a call y%set_host()
!!$ end subroutine d_cuda_multi_axpby_a
!!$ subroutine d_cuda_multi_mlt_v(x, y, info) !!$ subroutine d_cuda_multi_mlt_v(x, y, info)
!!$ use psi_serial_mod !!$ use psi_serial_mod
!!$ implicit none !!$ implicit none
@ -1860,41 +1862,50 @@ contains
!!$ call x%psb_d_base_multivect_type%scal(alpha) !!$ call x%psb_d_base_multivect_type%scal(alpha)
!!$ call x%set_host() !!$ call x%set_host()
!!$ end subroutine d_cuda_multi_scal !!$ end subroutine d_cuda_multi_scal
!!$
!!$ function d_cuda_multi_nrm2(nr,x) result(res)
!!$ function d_cuda_multi_nrm2(n,x) result(res) implicit none
!!$ implicit none class(psb_d_multivect_cuda), intent(inout) :: x
!!$ class(psb_d_multivect_cuda), intent(inout) :: x integer(psb_ipk_), intent(in) :: nr
!!$ integer(psb_ipk_), intent(in) :: n real(psb_dpk_), allocatable :: res(:)
!!$ real(psb_dpk_) :: res integer(psb_ipk_) :: info
!!$ integer(psb_ipk_) :: info ! WARNING: this should be changed.
!!$ ! WARNING: this should be changed. if (x%is_host()) call x%sync()
!!$ if (x%is_host()) call x%sync() allocate(res(x%get_ncols()))
!!$ info = nrm2MultiVecDevice(res,n,x%deviceVect) info = nrm2MultiVecDevice(res,nr,x%deviceVect)
!!$
!!$ end function d_cuda_multi_nrm2 end function d_cuda_multi_nrm2
!!$
!!$ function d_cuda_multi_amax(n,x) result(res) function d_cuda_multi_amax(nr,x) result(res)
!!$ implicit none implicit none
!!$ class(psb_d_multivect_cuda), intent(inout) :: x class(psb_d_multivect_cuda), intent(inout) :: x
!!$ integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(in) :: nr
!!$ real(psb_dpk_) :: res real(psb_dpk_) :: res
!!$ integer(psb_ipk_) :: i, nc
!!$ if (x%is_dev()) call x%sync()
!!$ res = maxval(abs(x%v(1:n))) if (x%is_dev()) call x%sync()
!!$ nc = x%get_ncols()
!!$ end function d_cuda_multi_amax res = 0
!!$ do i=1,nr
!!$ function d_cuda_multi_asum(n,x) result(res) res = max(res,sum(abs(x%v(i,1:nc))))
!!$ implicit none end do
!!$ class(psb_d_multivect_cuda), intent(inout) :: x
!!$ integer(psb_ipk_), intent(in) :: n end function d_cuda_multi_amax
!!$ real(psb_dpk_) :: res
!!$ function d_cuda_multi_asum(nr,x) result(res)
!!$ if (x%is_dev()) call x%sync() implicit none
!!$ res = sum(abs(x%v(1:n))) class(psb_d_multivect_cuda), intent(inout) :: x
!!$ integer(psb_ipk_), intent(in) :: nr
!!$ end function d_cuda_multi_asum real(psb_dpk_) :: res
integer(psb_ipk_) :: j
if (x%is_dev()) call x%sync()
res = 0
do j=1,x%get_ncols()
res = max(res,sum(abs(x%v(1:nr,j))))
end do
end function d_cuda_multi_asum
subroutine d_cuda_multi_all(m,n, x, info) subroutine d_cuda_multi_all(m,n, x, info)
use psi_serial_mod use psi_serial_mod

@ -267,6 +267,15 @@ module psb_d_vectordev_mod
real(c_double) :: res real(c_double) :: res
type(c_ptr), value :: deviceVecA, deviceVecB type(c_ptr), value :: deviceVecA, deviceVecB
end function dotMultiVecDeviceDouble end function dotMultiVecDeviceDouble
function dotMultiVecDeviceDoubleR2(res, n,deviceVecA,deviceVecB,ld) &
& result(val) bind(c,name='dotMultiVecDeviceDouble')
use iso_c_binding
integer(c_int) :: val
integer(c_int), value :: n
real(c_double) :: res(ld,*)
integer(c_int), value :: ld
type(c_ptr), value :: deviceVecA, deviceVecB
end function dotMultiVecDeviceDoubleR2
end interface end interface
interface nrm2MultiVecDevice interface nrm2MultiVecDevice
@ -278,6 +287,14 @@ module psb_d_vectordev_mod
real(c_double) :: res real(c_double) :: res
type(c_ptr), value :: deviceVecA type(c_ptr), value :: deviceVecA
end function nrm2MultiVecDeviceDouble end function nrm2MultiVecDeviceDouble
function nrm2MultiVecDeviceDoubleR2(res,n,deviceVecA) &
& result(val) bind(c,name='nrm2MultiVecDeviceDouble')
use iso_c_binding
integer(c_int) :: val
integer(c_int), value :: n
real(c_double) :: res(*)
type(c_ptr), value :: deviceVecA
end function nrm2MultiVecDeviceDoubleR2
end interface end interface
interface amaxMultiVecDevice interface amaxMultiVecDevice

@ -0,0 +1,39 @@
TOPDIR=../../..
include $(TOPDIR)/Make.inc
#
# Libraries used
#
LIBDIR=$(TOPDIR)/lib/
PSBLIBDIR=$(TOPDIR)/lib/
OPTDIR=$(LIBDIR)
PSBINCDIR=$(TOPDIR)/include
PSBMODDIR=$(TOPDIR)/modules
PSBLAS_LIB= -L$(LIBDIR) -L$(PSBLIBDIR) $(LCUDA) -lpsb_ext -lpsb_util -lpsb_base
INCDIR=$(TOPDIR)/include
MODDIR=$(TOPDIR)/modules
LDLIBS=$(PSBGPULDLIBS)
FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG)$(INCDIR) $(FMFLAG). $(FMFLAG)$(PSBMODDIR) $(FMFLAG)$(PSBINCDIR) $(LIBRSB_DEFINES)
DPMGOBJS=dpdegenmm.o
EXEDIR=./runs
all: dir pgen file
pgen: dpdegenmm
file:
dpdegenmm: dir
dir:
(if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi)
dpdegenmm: $(DPMGOBJS)
$(FLINK) $(LOPT) $(DPMGOBJS) -fopenmp -o dpdegenmm $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
/bin/mv dpdegenmm $(EXEDIR)
clean:
/bin/rm -f $(DPMGOBJS) $(EXEDIR)/dpdegenmm
lib:
(cd ../../; make library)
verycleanlib:
(cd ../../; make veryclean)

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save