psblas3/cuda/psb_s_cuda_vect_mod.F90

2100 lines
64 KiB
Fortran

! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
module psb_s_cuda_vect_mod
use iso_c_binding
use psb_const_mod
use psb_error_mod
use psb_s_vect_mod
use psb_cuda_env_mod
use psb_i_vect_mod
use psb_i_cuda_vect_mod
use psb_i_vectordev_mod
use psb_s_vectordev_mod
integer(psb_ipk_), parameter, private :: is_host = -1
integer(psb_ipk_), parameter, private :: is_sync = 0
integer(psb_ipk_), parameter, private :: is_dev = 1
type, extends(psb_s_base_vect_type) :: psb_s_vect_cuda
integer :: state = is_host
type(c_ptr) :: deviceVect = c_null_ptr
real(c_float), allocatable :: pinned_buffer(:)
type(c_ptr) :: dt_p_buf = c_null_ptr
real(c_float), allocatable :: buffer(:)
type(c_ptr) :: dt_buf = c_null_ptr
integer :: dt_buf_sz = 0
type(c_ptr) :: i_buf = c_null_ptr
integer :: i_buf_sz = 0
contains
procedure, pass(x) :: get_nrows => s_cuda_get_nrows
procedure, nopass :: get_fmt => s_cuda_get_fmt
procedure, pass(x) :: all => s_cuda_all
procedure, pass(x) :: zero => s_cuda_zero
procedure, pass(x) :: asb_m => s_cuda_asb_m
procedure, pass(x) :: sync => s_cuda_sync
procedure, pass(x) :: sync_space => s_cuda_sync_space
procedure, pass(x) :: bld_x => s_cuda_bld_x
procedure, pass(x) :: bld_mn => s_cuda_bld_mn
procedure, pass(x) :: free => s_cuda_free
procedure, pass(x) :: ins_a => s_cuda_ins_a
procedure, pass(x) :: ins_v => s_cuda_ins_v
procedure, pass(x) :: is_host => s_cuda_is_host
procedure, pass(x) :: is_dev => s_cuda_is_dev
procedure, pass(x) :: is_sync => s_cuda_is_sync
procedure, pass(x) :: set_host => s_cuda_set_host
procedure, pass(x) :: set_dev => s_cuda_set_dev
procedure, pass(x) :: set_sync => s_cuda_set_sync
procedure, pass(x) :: set_scal => s_cuda_set_scal
!!$ procedure, pass(x) :: set_vect => s_cuda_set_vect
procedure, pass(x) :: gthzv_x => s_cuda_gthzv_x
procedure, pass(y) :: sctb => s_cuda_sctb
procedure, pass(y) :: sctb_x => s_cuda_sctb_x
procedure, pass(x) :: gthzbuf => s_cuda_gthzbuf
procedure, pass(y) :: sctb_buf => s_cuda_sctb_buf
procedure, pass(x) :: new_buffer => s_cuda_new_buffer
procedure, nopass :: device_wait => s_cuda_device_wait
procedure, pass(x) :: free_buffer => s_cuda_free_buffer
procedure, pass(x) :: maybe_free_buffer => s_cuda_maybe_free_buffer
procedure, pass(x) :: dot_v => s_cuda_dot_v
procedure, pass(x) :: dot_a => s_cuda_dot_a
procedure, pass(y) :: axpby_v => s_cuda_axpby_v
procedure, pass(y) :: axpby_a => s_cuda_axpby_a
procedure, pass(z) :: abgdxyz => s_cuda_abgdxyz
procedure, pass(y) :: mlt_v => s_cuda_mlt_v
procedure, pass(y) :: mlt_a => s_cuda_mlt_a
procedure, pass(z) :: mlt_a_2 => s_cuda_mlt_a_2
procedure, pass(z) :: mlt_v_2 => s_cuda_mlt_v_2
procedure, pass(x) :: scal => s_cuda_scal
procedure, pass(x) :: nrm2 => s_cuda_nrm2
procedure, pass(x) :: amax => s_cuda_amax
procedure, pass(x) :: asum => s_cuda_asum
procedure, pass(x) :: absval1 => s_cuda_absval1
procedure, pass(x) :: absval2 => s_cuda_absval2
final :: s_cuda_vect_finalize
end type psb_s_vect_cuda
public :: psb_s_vect_cuda_
private :: constructor
interface psb_s_vect_cuda_
module procedure constructor
end interface psb_s_vect_cuda_
contains
function constructor(x) result(this)
real(psb_spk_) :: x(:)
type(psb_s_vect_cuda) :: this
integer(psb_ipk_) :: info
this%v = x
call this%asb(size(x),info)
end function constructor
subroutine s_cuda_device_wait()
call psb_cudaSync()
end subroutine s_cuda_device_wait
subroutine s_cuda_new_buffer(n,x,info)
use psb_realloc_mod
use psb_cuda_env_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
integer(psb_ipk_), intent(out) :: info
if (psb_cuda_DeviceHasUVA()) then
if (allocated(x%combuf)) then
if (size(x%combuf)<n) then
call inner_unregister(x%combuf)
deallocate(x%combuf,stat=info)
end if
end if
if (.not.allocated(x%combuf)) then
call psb_realloc(n,x%combuf,info)
if (info == 0) info = inner_register(x%combuf,x%dt_p_buf)
end if
else
call psb_realloc(n,x%combuf,info)
end if
if (c_associated(x%dt_buf)) then
if (x%dt_buf_sz < n) then
call freeFloat(x%dt_buf)
x%dt_buf = c_null_ptr
x%dt_buf_sz = 0
end if
end if
if (.not.c_associated(x%dt_buf)) then
info = allocateFloat(x%dt_buf,n)
x%dt_buf_sz = n
end if
if (c_associated(x%i_buf)) then
if (x%i_buf_sz < n) then
call freeInt(x%i_buf)
x%i_buf = c_null_ptr
x%i_buf_sz = 0
end if
end if
if (.not.c_associated(x%i_buf)) then
info = allocateInt(x%i_buf,n)
x%i_buf_sz = n
end if
end subroutine s_cuda_new_buffer
subroutine s_cuda_maybe_free_buffer(x,info)
use psb_realloc_mod
use psb_cuda_env_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
info = 0
if (psb_cuda_get_maybe_free_buffer()) &
& call x%free_buffer(info)
end subroutine s_cuda_maybe_free_buffer
subroutine s_cuda_free_buffer(x,info)
use psb_realloc_mod
use psb_cuda_env_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
if (allocated(x%pinned_buffer)) then
call inner_unregister(x%pinned_buffer)
deallocate(x%pinned_buffer, stat=info)
end if
if (allocated(x%combuf)) then
if (psb_cuda_DeviceHasUVA()) &
& call inner_unregister(x%combuf)
deallocate(x%combuf, stat=info)
end if
if (allocated(x%buffer)) then
deallocate(x%buffer, stat=info)
end if
if (c_associated(x%dt_buf)) then
call freeFloat(x%dt_buf)
x%dt_buf = c_null_ptr
end if
if (c_associated(x%i_buf)) then
call freeInt(x%i_buf)
x%i_buf = c_null_ptr
end if
x%dt_buf_sz=0
x%i_buf_sz=0
end subroutine s_cuda_free_buffer
subroutine s_cuda_gthzv_x(i,n,idx,x,y)
use psb_cuda_env_mod
use psi_serial_mod
integer(psb_ipk_) :: i,n
class(psb_i_base_vect_type) :: idx
real(psb_spk_) :: y(:)
class(psb_s_vect_cuda) :: x
integer :: info, ni
info = 0
select type(ii=> idx)
class is (psb_i_vect_cuda)
if (ii%is_host()) call ii%sync()
if (x%is_host()) call x%sync()
if (psb_cuda_DeviceHasUVA()) then
!
! Only need a sync in this branch; in the others
! cudamemCpy acts as a sync point.
!
if (allocated(x%pinned_buffer)) then
if (size(x%pinned_buffer) < n) then
call inner_unregister(x%pinned_buffer)
deallocate(x%pinned_buffer, stat=info)
end if
end if
if (.not.allocated(x%pinned_buffer)) then
allocate(x%pinned_buffer(n),stat=info)
if (info == 0) info = inner_register(x%pinned_buffer,x%dt_p_buf)
if (info /= 0) &
& write(0,*) 'Error from inner_register ',info
endif
info = igathMultiVecDeviceFloatVecIdx(x%deviceVect,&
& 0, n, i, ii%deviceVect, 1, x%dt_p_buf, 1)
call psb_cudaSync()
y(1:n) = x%pinned_buffer(1:n)
else
if (allocated(x%buffer)) then
if (size(x%buffer) < n) then
deallocate(x%buffer, stat=info)
end if
end if
if (.not.allocated(x%buffer)) then
allocate(x%buffer(n),stat=info)
end if
if (x%dt_buf_sz < n) then
if (c_associated(x%dt_buf)) then
call freeFloat(x%dt_buf)
x%dt_buf = c_null_ptr
end if
info = allocateFloat(x%dt_buf,n)
x%dt_buf_sz=n
end if
if (info == 0) &
& info = igathMultiVecDeviceFloatVecIdx(x%deviceVect,&
& 0, n, i, ii%deviceVect, 1, x%dt_buf, 1)
if (info == 0) &
& info = readFloat(x%dt_buf,y,n)
endif
class default
! Do not go for brute force, but move the index vector
ni = size(ii%v)
if (x%i_buf_sz < ni) then
if (c_associated(x%i_buf)) then
call freeInt(x%i_buf)
x%i_buf = c_null_ptr
end if
info = allocateInt(x%i_buf,ni)
x%i_buf_sz=ni
end if
if (allocated(x%buffer)) then
if (size(x%buffer) < n) then
deallocate(x%buffer, stat=info)
end if
end if
if (.not.allocated(x%buffer)) then
allocate(x%buffer(n),stat=info)
end if
if (x%dt_buf_sz < n) then
if (c_associated(x%dt_buf)) then
call freeFloat(x%dt_buf)
x%dt_buf = c_null_ptr
end if
info = allocateFloat(x%dt_buf,n)
x%dt_buf_sz=n
end if
if (info == 0) &
& info = writeInt(x%i_buf,ii%v,ni)
if (info == 0) &
& info = igathMultiVecDeviceFloat(x%deviceVect,&
& 0, n, i, x%i_buf, 1, x%dt_buf, 1)
if (info == 0) &
& info = readFloat(x%dt_buf,y,n)
end select
end subroutine s_cuda_gthzv_x
subroutine s_cuda_gthzbuf(i,n,idx,x)
use psb_cuda_env_mod
use psi_serial_mod
integer(psb_ipk_) :: i,n
class(psb_i_base_vect_type) :: idx
class(psb_s_vect_cuda) :: x
integer :: info, ni
info = 0
!!$ write(0,*) 'Starting gth_zbuf'
if (.not.allocated(x%combuf)) then
call psb_errpush(psb_err_alloc_dealloc_,'gthzbuf')
return
end if
select type(ii=> idx)
class is (psb_i_vect_cuda)
if (ii%is_host()) call ii%sync()
if (x%is_host()) call x%sync()
if (psb_cuda_DeviceHasUVA()) then
info = igathMultiVecDeviceFloatVecIdx(x%deviceVect,&
& 0, n, i, ii%deviceVect, i,x%dt_p_buf, 1)
else
info = igathMultiVecDeviceFloatVecIdx(x%deviceVect,&
& 0, n, i, ii%deviceVect, i,x%dt_buf, 1)
if (info == 0) &
& info = readFloat(i,x%dt_buf,x%combuf(i:),n,1)
endif
class default
! Do not go for brute force, but move the index vector
ni = size(ii%v)
info = 0
if (.not.c_associated(x%i_buf)) then
info = allocateInt(x%i_buf,ni)
x%i_buf_sz=ni
end if
if (info == 0) &
& info = writeInt(i,x%i_buf,ii%v(i:),n,1)
if (info == 0) &
& info = igathMultiVecDeviceFloat(x%deviceVect,&
& 0, n, i, x%i_buf, i,x%dt_buf, 1)
if (info == 0) &
& info = readFloat(i,x%dt_buf,x%combuf(i:),n,1)
end select
end subroutine s_cuda_gthzbuf
subroutine s_cuda_sctb(n,idx,x,beta,y)
implicit none
!use psb_const_mod
integer(psb_ipk_) :: n, idx(:)
real(psb_spk_) :: beta, x(:)
class(psb_s_vect_cuda) :: y
integer(psb_ipk_) :: info
if (n == 0) return
if (y%is_dev()) call y%sync()
call y%psb_s_base_vect_type%sctb(n,idx,x,beta)
call y%set_host()
end subroutine s_cuda_sctb
subroutine s_cuda_sctb_x(i,n,idx,x,beta,y)
use psb_cuda_env_mod
use psi_serial_mod
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
real(psb_spk_) :: beta, x(:)
class(psb_s_vect_cuda) :: y
integer :: info, ni
select type(ii=> idx)
class is (psb_i_vect_cuda)
if (ii%is_host()) call ii%sync()
if (y%is_host()) call y%sync()
!
if (psb_cuda_DeviceHasUVA()) then
if (allocated(y%pinned_buffer)) then
if (size(y%pinned_buffer) < n) then
call inner_unregister(y%pinned_buffer)
deallocate(y%pinned_buffer, stat=info)
end if
end if
if (.not.allocated(y%pinned_buffer)) then
allocate(y%pinned_buffer(n),stat=info)
if (info == 0) info = inner_register(y%pinned_buffer,y%dt_p_buf)
if (info /= 0) &
& write(0,*) 'Error from inner_register ',info
endif
y%pinned_buffer(1:n) = x(1:n)
info = iscatMultiVecDeviceFloatVecIdx(y%deviceVect,&
& 0, n, i, ii%deviceVect, 1, y%dt_p_buf, 1,beta)
else
if (allocated(y%buffer)) then
if (size(y%buffer) < n) then
deallocate(y%buffer, stat=info)
end if
end if
if (.not.allocated(y%buffer)) then
allocate(y%buffer(n),stat=info)
end if
if (y%dt_buf_sz < n) then
if (c_associated(y%dt_buf)) then
call freeFloat(y%dt_buf)
y%dt_buf = c_null_ptr
end if
info = allocateFloat(y%dt_buf,n)
y%dt_buf_sz=n
end if
info = writeFloat(y%dt_buf,x,n)
info = iscatMultiVecDeviceFloatVecIdx(y%deviceVect,&
& 0, n, i, ii%deviceVect, 1, y%dt_buf, 1,beta)
end if
class default
ni = size(ii%v)
if (y%i_buf_sz < ni) then
if (c_associated(y%i_buf)) then
call freeInt(y%i_buf)
y%i_buf = c_null_ptr
end if
info = allocateInt(y%i_buf,ni)
y%i_buf_sz=ni
end if
if (allocated(y%buffer)) then
if (size(y%buffer) < n) then
deallocate(y%buffer, stat=info)
end if
end if
if (.not.allocated(y%buffer)) then
allocate(y%buffer(n),stat=info)
end if
if (y%dt_buf_sz < n) then
if (c_associated(y%dt_buf)) then
call freeFloat(y%dt_buf)
y%dt_buf = c_null_ptr
end if
info = allocateFloat(y%dt_buf,n)
y%dt_buf_sz=n
end if
if (info == 0) &
& info = writeInt(y%i_buf,ii%v(i:i+n-1),n)
info = writeFloat(y%dt_buf,x,n)
info = iscatMultiVecDeviceFloat(y%deviceVect,&
& 0, n, 1, y%i_buf, 1, y%dt_buf, 1,beta)
end select
!
! Need a sync here to make sure we are not reallocating
! the buffers before iscatMulti has finished.
!
call psb_cudaSync()
call y%set_dev()
end subroutine s_cuda_sctb_x
subroutine s_cuda_sctb_buf(i,n,idx,beta,y)
use psi_serial_mod
use psb_cuda_env_mod
implicit none
integer(psb_ipk_) :: i, n
class(psb_i_base_vect_type) :: idx
real(psb_spk_) :: beta
class(psb_s_vect_cuda) :: y
integer(psb_ipk_) :: info, ni
!!$ write(0,*) 'Starting sctb_buf'
if (.not.allocated(y%combuf)) then
call psb_errpush(psb_err_alloc_dealloc_,'sctb_buf')
return
end if
select type(ii=> idx)
class is (psb_i_vect_cuda)
if (ii%is_host()) call ii%sync()
if (y%is_host()) call y%sync()
if (psb_cuda_DeviceHasUVA()) then
info = iscatMultiVecDeviceFloatVecIdx(y%deviceVect,&
& 0, n, i, ii%deviceVect, i, y%dt_p_buf, 1,beta)
else
info = writeFloat(i,y%dt_buf,y%combuf(i:),n,1)
info = iscatMultiVecDeviceFloatVecIdx(y%deviceVect,&
& 0, n, i, ii%deviceVect, i, y%dt_buf, 1,beta)
end if
class default
!call y%sct(n,ii%v(i:),x,beta)
ni = size(ii%v)
info = 0
if (.not.c_associated(y%i_buf)) then
info = allocateInt(y%i_buf,ni)
y%i_buf_sz=ni
end if
if (info == 0) &
& info = writeInt(i,y%i_buf,ii%v(i:),n,1)
if (info == 0) &
& info = writeFloat(i,y%dt_buf,y%combuf(i:),n,1)
if (info == 0) info = iscatMultiVecDeviceFloat(y%deviceVect,&
& 0, n, i, y%i_buf, i, y%dt_buf, 1,beta)
end select
!!$ write(0,*) 'Done sctb_buf'
end subroutine s_cuda_sctb_buf
subroutine s_cuda_bld_x(x,this)
use psb_base_mod
real(psb_spk_), intent(in) :: this(:)
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_) :: info
call psb_realloc(size(this),x%v,info)
if (info /= 0) then
info=psb_err_alloc_request_
call psb_errpush(info,'s_cuda_bld_x',&
& i_err=(/size(this),izero,izero,izero,izero/))
end if
x%v(:) = this(:)
call x%set_host()
call x%sync()
end subroutine s_cuda_bld_x
subroutine s_cuda_bld_mn(x,n)
integer(psb_mpk_), intent(in) :: n
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_) :: info
call x%all(n,info)
if (info /= 0) then
call psb_errpush(info,'s_cuda_bld_n',i_err=(/n,n,n,n,n/))
end if
end subroutine s_cuda_bld_mn
subroutine s_cuda_set_host(x)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
x%state = is_host
end subroutine s_cuda_set_host
subroutine s_cuda_set_dev(x)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
x%state = is_dev
end subroutine s_cuda_set_dev
subroutine s_cuda_set_sync(x)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
x%state = is_sync
end subroutine s_cuda_set_sync
function s_cuda_is_dev(x) result(res)
implicit none
class(psb_s_vect_cuda), intent(in) :: x
logical :: res
res = (x%state == is_dev)
end function s_cuda_is_dev
function s_cuda_is_host(x) result(res)
implicit none
class(psb_s_vect_cuda), intent(in) :: x
logical :: res
res = (x%state == is_host)
end function s_cuda_is_host
function s_cuda_is_sync(x) result(res)
implicit none
class(psb_s_vect_cuda), intent(in) :: x
logical :: res
res = (x%state == is_sync)
end function s_cuda_is_sync
function s_cuda_get_nrows(x) result(res)
implicit none
class(psb_s_vect_cuda), intent(in) :: x
integer(psb_ipk_) :: res
res = 0
if (allocated(x%v)) res = size(x%v)
end function s_cuda_get_nrows
function s_cuda_get_fmt() result(res)
implicit none
character(len=5) :: res
res = 'sGPU'
end function s_cuda_get_fmt
subroutine s_cuda_all(n, x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
integer(psb_ipk_), intent(in) :: n
class(psb_s_vect_cuda), intent(out) :: x
integer(psb_ipk_), intent(out) :: info
call psb_realloc(n,x%v,info)
if (info == 0) call x%set_host()
if (info == 0) call x%sync_space(info)
if (info /= 0) then
info=psb_err_alloc_request_
call psb_errpush(info,'s_cuda_all',&
& i_err=(/n,n,n,n,n/))
end if
end subroutine s_cuda_all
subroutine s_cuda_zero(x)
use psi_serial_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
! Since we are overwriting, make sure to do it
! on the GPU side
call x%set_dev()
call x%set_scal(szero)
end subroutine s_cuda_zero
subroutine s_cuda_asb_m(n, x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
integer(psb_mpk_), intent(in) :: n
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_mpk_) :: nd
if (x%is_dev()) then
nd = getMultiVecDeviceSize(x%deviceVect)
if (nd < n) then
call x%sync()
call x%psb_s_base_vect_type%asb(n,info)
if (info == psb_success_) call x%sync_space(info)
call x%set_host()
end if
else !
if (x%get_nrows()<n) then
call x%psb_s_base_vect_type%asb(n,info)
if (info == psb_success_) call x%sync_space(info)
call x%set_host()
end if
end if
end subroutine s_cuda_asb_m
subroutine s_cuda_sync_space(x,info)
use psb_base_mod, only : psb_realloc
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nh, nd
info = 0
if (x%is_dev()) then
!
if (.not.allocated(x%v)) then
nh = 0
else
nh = size(x%v)
end if
nd = getMultiVecDeviceSize(x%deviceVect)
if (nh < nd ) then
call psb_realloc(nd,x%v,info)
end if
else ! if (x%is_host()) then
if (.not.allocated(x%v)) then
nh = 0
else
nh = size(x%v)
end if
if (c_associated(x%deviceVect)) then
nd = getMultiVecDeviceSize(x%deviceVect)
if (nd < nh ) then
call freeMultiVecDevice(x%deviceVect)
x%deviceVect=c_null_ptr
end if
end if
if (.not.c_associated(x%deviceVect)) then
info = FallocMultiVecDevice(x%deviceVect,1,nh,spgpu_type_float)
if (info /= 0) then
if (info == spgpu_outofmem) then
info = psb_err_alloc_request_
end if
end if
end if
end if
end subroutine s_cuda_sync_space
subroutine s_cuda_sync(x)
use psb_base_mod, only : psb_realloc
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_) :: n,info
info = 0
if (x%is_host()) then
if (.not.c_associated(x%deviceVect)) then
n = size(x%v)
info = FallocMultiVecDevice(x%deviceVect,1,n,spgpu_type_float)
end if
if (info == 0) &
& info = writeMultiVecDevice(x%deviceVect,x%v)
else if (x%is_dev()) then
n = getMultiVecDeviceSize(x%deviceVect)
if (.not.allocated(x%v)) then
!!$ write(0,*) 'Incoherent situation : x%v not allocated'
call psb_realloc(n,x%v,info)
end if
if ((n > size(x%v)).or.(n > x%get_nrows())) then
!!$ write(0,*) 'Incoherent situation : sizes',n,size(x%v),x%get_nrows()
call psb_realloc(n,x%v,info)
end if
info = readMultiVecDevice(x%deviceVect,x%v)
end if
if (info == 0) call x%set_sync()
if (info /= 0) then
info=psb_err_internal_error_
call psb_errpush(info,'s_cuda_sync')
end if
end subroutine s_cuda_sync
subroutine s_cuda_free(x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
info = 0
if (allocated(x%v)) deallocate(x%v, stat=info)
if (c_associated(x%deviceVect)) then
!!$ write(0,*)'d_cuda_free Calling freeMultiVecDevice'
call freeMultiVecDevice(x%deviceVect)
x%deviceVect=c_null_ptr
end if
call x%free_buffer(info)
call x%set_sync()
end subroutine s_cuda_free
subroutine s_cuda_set_scal(x,val,first,last)
class(psb_s_vect_cuda), intent(inout) :: x
real(psb_spk_), intent(in) :: val
integer(psb_ipk_), optional :: first, last
integer(psb_ipk_) :: info, first_, last_
first_ = 1
last_ = x%get_nrows()
if (present(first)) first_ = max(1,first)
if (present(last)) last_ = min(last,last_)
info = setScalDevice(val,first_,last_,1,x%deviceVect)
call x%set_dev()
end subroutine s_cuda_set_scal
!!$
!!$ subroutine s_cuda_set_vect(x,val)
!!$ class(psb_s_vect_cuda), intent(inout) :: x
!!$ real(psb_spk_), intent(in) :: val(:)
!!$ integer(psb_ipk_) :: nr
!!$ integer(psb_ipk_) :: info
!!$
!!$ if (x%is_dev()) call x%sync()
!!$ call x%psb_s_base_vect_type%set_vect(val)
!!$ call x%set_host()
!!$
!!$ end subroutine s_cuda_set_vect
function s_cuda_dot_v(n,x,y) result(res)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
class(psb_s_base_vect_type), intent(inout) :: y
integer(psb_ipk_), intent(in) :: n
real(psb_spk_) :: res
real(psb_spk_), external :: ddot
integer(psb_ipk_) :: info
res = szero
!
! Note: this is the gpu implementation.
! When we get here, we are sure that X is of
! TYPE psb_s_vect
!
select type(yy => y)
type is (psb_s_base_vect_type)
if (x%is_dev()) call x%sync()
res = ddot(n,x%v,1,yy%v,1)
type is (psb_s_vect_cuda)
if (x%is_host()) call x%sync()
if (yy%is_host()) call yy%sync()
info = dotMultiVecDevice(res,n,x%deviceVect,yy%deviceVect)
if (info /= 0) then
info = psb_err_internal_error_
call psb_errpush(info,'s_cuda_dot_v')
end if
class default
! y%sync is done in dot_a
call x%sync()
res = y%dot(n,x%v)
end select
end function s_cuda_dot_v
function s_cuda_dot_a(n,x,y) result(res)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
real(psb_spk_), intent(in) :: y(:)
integer(psb_ipk_), intent(in) :: n
real(psb_spk_) :: res
real(psb_spk_), external :: ddot
if (x%is_dev()) call x%sync()
res = ddot(n,y,1,x%v,1)
end function s_cuda_dot_a
subroutine s_cuda_axpby_v(m,alpha, x, beta, y, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
class(psb_s_base_vect_type), intent(inout) :: x
class(psb_s_vect_cuda), intent(inout) :: y
real(psb_spk_), intent (in) :: alpha, beta
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nx, ny
info = psb_success_
select type(xx => x)
type is (psb_s_vect_cuda)
! Do something different here
if ((beta /= szero).and.y%is_host())&
& call y%sync()
if (xx%is_host()) call xx%sync()
nx = getMultiVecDeviceSize(xx%deviceVect)
ny = getMultiVecDeviceSize(y%deviceVect)
if ((nx<m).or.(ny<m)) then
info = psb_err_internal_error_
else
info = axpbyMultiVecDevice(m,alpha,xx%deviceVect,beta,y%deviceVect)
end if
call y%set_dev()
class default
! Do it on the host side
if ((alpha /= szero).and.(x%is_dev()))&
& call x%sync()
call y%axpby(m,alpha,x%v,beta,info)
end select
end subroutine s_cuda_axpby_v
subroutine s_cuda_abgdxyz(m,alpha, beta, gamma,delta,x, y, z, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
class(psb_s_base_vect_type), intent(inout) :: x
class(psb_s_base_vect_type), intent(inout) :: y
class(psb_s_vect_cuda), intent(inout) :: z
real(psb_spk_), intent (in) :: alpha, beta, gamma, delta
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nx, ny, nz
logical :: gpu_done
info = psb_success_
if (.true.) then
gpu_done = .false.
select type(xx => x)
class is (psb_s_vect_cuda)
select type(yy => y)
class is (psb_s_vect_cuda)
select type(zz => z)
class is (psb_s_vect_cuda)
! Do something different here
if ((beta /= szero).and.yy%is_host())&
& call yy%sync()
if ((delta /= szero).and.zz%is_host())&
& call zz%sync()
if (xx%is_host()) call xx%sync()
nx = getMultiVecDeviceSize(xx%deviceVect)
ny = getMultiVecDeviceSize(yy%deviceVect)
nz = getMultiVecDeviceSize(zz%deviceVect)
if ((nx<m).or.(ny<m).or.(nz<m)) then
info = psb_err_internal_error_
else
info = abgdxyzMultiVecDevice(m,alpha,beta,gamma,delta,&
& xx%deviceVect,yy%deviceVect,zz%deviceVect)
end if
call yy%set_dev()
call zz%set_dev()
gpu_done = .true.
end select
end select
end select
if (.not.gpu_done) then
if (x%is_host()) call x%sync()
if (y%is_host()) call y%sync()
if (z%is_host()) call z%sync()
call y%axpby(m,alpha,x,beta,info)
call z%axpby(m,gamma,y,delta,info)
end if
else
if (x%is_host()) call x%sync()
if (y%is_host()) call y%sync()
if (z%is_host()) call z%sync()
call y%axpby(m,alpha,x,beta,info)
call z%axpby(m,gamma,y,delta,info)
end if
end subroutine s_cuda_abgdxyz
subroutine s_cuda_xyzw(m,a,b,c,d,e,f,x, y, z,w, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
class(psb_s_base_vect_type), intent(inout) :: x
class(psb_s_base_vect_type), intent(inout) :: y
class(psb_s_base_vect_type), intent(inout) :: z
class(psb_s_vect_cuda), intent(inout) :: w
real(psb_spk_), intent (in) :: a,b,c,d,e,f
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nx, ny, nz, nw
logical :: gpu_done
info = psb_success_
gpu_done = .false.
if ((a==szero).or.(b==szero).or. &
& (c==szero).or.(d==szero).or.&
& (e==szero).or.(f==szero)) then
write(0,*) 'XYZW assumes a,b,c,d,e,f are all nonzero'
else
select type(xx => x)
class is (psb_s_vect_cuda)
select type(yy => y)
class is (psb_s_vect_cuda)
select type(zz => z)
class is (psb_s_vect_cuda)
! Do something different here
if (xx%is_host()) call xx%sync()
if (yy%is_host()) call yy%sync()
if (zz%is_host()) call zz%sync()
if (w%is_host()) call w%sync()
nx = getMultiVecDeviceSize(xx%deviceVect)
ny = getMultiVecDeviceSize(yy%deviceVect)
nz = getMultiVecDeviceSize(zz%deviceVect)
nw = getMultiVecDeviceSize(w%deviceVect)
if ((nx<m).or.(ny<m).or.(nz<m).or.(nw<m)) then
info = psb_err_internal_error_
else
info = xyzwMultiVecDevice(m,a,b,c,d,e,f,&
& xx%deviceVect,yy%deviceVect,zz%deviceVect,w%deviceVect)
end if
call yy%set_dev()
call zz%set_dev()
call w%set_dev()
gpu_done = .true.
end select
end select
end select
if (.not.gpu_done) then
if (x%is_host()) call x%sync()
if (y%is_host()) call y%sync()
if (z%is_host()) call z%sync()
if (w%is_host()) call w%sync()
call y%axpby(m,a,x,b,info)
call z%axpby(m,c,y,d,info)
call w%axpby(m,e,z,f,info)
end if
end if
end subroutine s_cuda_xyzw
subroutine s_cuda_axpby_a(m,alpha, x, beta, y, info)
use psi_serial_mod
implicit none
integer(psb_ipk_), intent(in) :: m
real(psb_spk_), intent(in) :: x(:)
class(psb_s_vect_cuda), intent(inout) :: y
real(psb_spk_), intent (in) :: alpha, beta
integer(psb_ipk_), intent(out) :: info
if ((beta /= szero).and.(y%is_dev()))&
& call y%sync()
call psb_geaxpby(m,alpha,x,beta,y%v,info)
call y%set_host()
end subroutine s_cuda_axpby_a
subroutine s_cuda_mlt_v(x, y, info)
use psi_serial_mod
implicit none
class(psb_s_base_vect_type), intent(inout) :: x
class(psb_s_vect_cuda), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, n
info = 0
n = min(x%get_nrows(),y%get_nrows())
select type(xx => x)
type is (psb_s_base_vect_type)
if (y%is_dev()) call y%sync()
do i=1, n
y%v(i) = y%v(i) * xx%v(i)
end do
call y%set_host()
type is (psb_s_vect_cuda)
! Do something different here
if (y%is_host()) call y%sync()
if (xx%is_host()) call xx%sync()
info = axyMultiVecDevice(n,sone,xx%deviceVect,y%deviceVect)
call y%set_dev()
class default
if (xx%is_dev()) call xx%sync()
if (y%is_dev()) call y%sync()
call y%mlt(xx%v,info)
call y%set_host()
end select
end subroutine s_cuda_mlt_v
subroutine s_cuda_mlt_a(x, y, info)
use psi_serial_mod
implicit none
real(psb_spk_), intent(in) :: x(:)
class(psb_s_vect_cuda), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, n
info = 0
if (y%is_dev()) call y%sync()
call y%psb_s_base_vect_type%mlt(x,info)
! set_host() is invoked in the base method
end subroutine s_cuda_mlt_a
subroutine s_cuda_mlt_a_2(alpha,x,y,beta,z,info)
use psi_serial_mod
implicit none
real(psb_spk_), intent(in) :: alpha,beta
real(psb_spk_), intent(in) :: x(:)
real(psb_spk_), intent(in) :: y(:)
class(psb_s_vect_cuda), intent(inout) :: z
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, n
info = 0
if (z%is_dev()) call z%sync()
call z%psb_s_base_vect_type%mlt(alpha,x,y,beta,info)
! set_host() is invoked in the base method
end subroutine s_cuda_mlt_a_2
subroutine s_cuda_mlt_v_2(alpha,x,y, beta,z,info,conjgx,conjgy)
use psi_serial_mod
use psb_string_mod
implicit none
real(psb_spk_), intent(in) :: alpha,beta
class(psb_s_base_vect_type), intent(inout) :: x
class(psb_s_base_vect_type), intent(inout) :: y
class(psb_s_vect_cuda), intent(inout) :: z
integer(psb_ipk_), intent(out) :: info
character(len=1), intent(in), optional :: conjgx, conjgy
integer(psb_ipk_) :: i, n
logical :: conjgx_, conjgy_
if (.false.) then
! These are present just for coherence with the
! complex versions; they do nothing here.
conjgx_=.false.
if (present(conjgx)) conjgx_ = (psb_toupper(conjgx)=='C')
conjgy_=.false.
if (present(conjgy)) conjgy_ = (psb_toupper(conjgy)=='C')
end if
n = min(x%get_nrows(),y%get_nrows(),z%get_nrows())
!
! Need to reconsider BETA in the GPU side
! of things.
!
info = 0
select type(xx => x)
type is (psb_s_vect_cuda)
select type (yy => y)
type is (psb_s_vect_cuda)
if (xx%is_host()) call xx%sync()
if (yy%is_host()) call yy%sync()
if ((beta /= szero).and.(z%is_host())) call z%sync()
info = axybzMultiVecDevice(n,alpha,xx%deviceVect,&
& yy%deviceVect,beta,z%deviceVect)
call z%set_dev()
class default
if (xx%is_dev()) call xx%sync()
if (yy%is_dev()) call yy%sync()
if ((beta /= szero).and.(z%is_dev())) call z%sync()
call z%psb_s_base_vect_type%mlt(alpha,xx,yy,beta,info)
call z%set_host()
end select
class default
if (x%is_dev()) call x%sync()
if (y%is_dev()) call y%sync()
if ((beta /= szero).and.(z%is_dev())) call z%sync()
call z%psb_s_base_vect_type%mlt(alpha,x,y,beta,info)
call z%set_host()
end select
end subroutine s_cuda_mlt_v_2
subroutine s_cuda_scal(alpha, x)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
real(psb_spk_), intent (in) :: alpha
integer(psb_ipk_) :: info
if (x%is_host()) call x%sync()
info = scalMultiVecDevice(alpha,x%deviceVect)
call x%set_dev()
end subroutine s_cuda_scal
function s_cuda_nrm2(n,x) result(res)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
real(psb_spk_) :: res
integer(psb_ipk_) :: info
! WARNING: this should be changed.
if (x%is_host()) call x%sync()
info = nrm2MultiVecDevice(res,n,x%deviceVect)
end function s_cuda_nrm2
function s_cuda_amax(n,x) result(res)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
real(psb_spk_) :: res
integer(psb_ipk_) :: info
if (x%is_host()) call x%sync()
info = amaxMultiVecDevice(res,n,x%deviceVect)
end function s_cuda_amax
function s_cuda_asum(n,x) result(res)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n
real(psb_spk_) :: res
integer(psb_ipk_) :: info
if (x%is_host()) call x%sync()
info = asumMultiVecDevice(res,n,x%deviceVect)
end function s_cuda_asum
subroutine s_cuda_absval1(x)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_) :: n
integer(psb_ipk_) :: info
if (x%is_host()) call x%sync()
n=x%get_nrows()
info = absMultiVecDevice(n,sone,x%deviceVect)
end subroutine s_cuda_absval1
subroutine s_cuda_absval2(x,y)
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
class(psb_s_base_vect_type), intent(inout) :: y
integer(psb_ipk_) :: n
integer(psb_ipk_) :: info
n=min(x%get_nrows(),y%get_nrows())
select type (yy=> y)
class is (psb_s_vect_cuda)
if (x%is_host()) call x%sync()
if (yy%is_host()) call yy%sync()
info = absMultiVecDevice(n,sone,x%deviceVect,yy%deviceVect)
class default
if (x%is_dev()) call x%sync()
if (y%is_dev()) call y%sync()
call x%psb_s_base_vect_type%absval(y)
end select
end subroutine s_cuda_absval2
subroutine s_cuda_vect_finalize(x)
use psi_serial_mod
use psb_realloc_mod
implicit none
type(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_) :: info
info = 0
call x%free(info)
end subroutine s_cuda_vect_finalize
subroutine s_cuda_ins_v(n,irl,val,dupl,x,info)
use psi_serial_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
class(psb_i_base_vect_type), intent(inout) :: irl
class(psb_s_base_vect_type), intent(inout) :: val
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i, isz
logical :: done_cuda
info = 0
if (psb_errstatus_fatal()) return
done_cuda = .false.
select type(virl => irl)
class is (psb_i_vect_cuda)
select type(vval => val)
class is (psb_s_vect_cuda)
if (vval%is_host()) call vval%sync()
if (virl%is_host()) call virl%sync()
if (x%is_host()) call x%sync()
info = geinsMultiVecDeviceFloat(n,virl%deviceVect,&
& vval%deviceVect,dupl,1,x%deviceVect)
call x%set_dev()
done_cuda=.true.
end select
end select
if (.not.done_cuda) then
if (irl%is_dev()) call irl%sync()
if (val%is_dev()) call val%sync()
call x%ins(n,irl%v,val%v,dupl,info)
end if
if (info /= 0) then
call psb_errpush(info,'cuda_vect_ins')
return
end if
end subroutine s_cuda_ins_v
subroutine s_cuda_ins_a(n,irl,val,dupl,x,info)
use psi_serial_mod
implicit none
class(psb_s_vect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
integer(psb_ipk_), intent(in) :: irl(:)
real(psb_spk_), intent(in) :: val(:)
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i
info = 0
if (x%is_dev()) call x%sync()
call x%psb_s_base_vect_type%ins(n,irl,val,dupl,info)
call x%set_host()
end subroutine s_cuda_ins_a
end module psb_s_cuda_vect_mod
!
! Multivectors
!
module psb_s_cuda_multivect_mod
use iso_c_binding
use psb_const_mod
use psb_error_mod
use psb_s_multivect_mod
use psb_s_base_multivect_mod
use psb_cuda_env_mod
use psb_i_multivect_mod
use psb_i_cuda_multivect_mod
use psb_s_vectordev_mod
integer(psb_ipk_), parameter, private :: is_host = -1
integer(psb_ipk_), parameter, private :: is_sync = 0
integer(psb_ipk_), parameter, private :: is_dev = 1
type, extends(psb_s_base_multivect_type) :: psb_s_multivect_cuda
integer(psb_ipk_) :: state = is_host, m_nrows=0, m_ncols=0
type(c_ptr) :: deviceVect = c_null_ptr
real(c_double), allocatable :: buffer(:,:)
type(c_ptr) :: dt_buf = c_null_ptr
contains
procedure, pass(x) :: get_nrows => s_cuda_multi_get_nrows
procedure, pass(x) :: get_ncols => s_cuda_multi_get_ncols
procedure, nopass :: get_fmt => s_cuda_multi_get_fmt
!!$ procedure, pass(x) :: dot_v => s_cuda_multi_dot_v
!!$ procedure, pass(x) :: dot_a => s_cuda_multi_dot_a
!!$ procedure, pass(y) :: axpby_v => s_cuda_multi_axpby_v
!!$ procedure, pass(y) :: axpby_a => s_cuda_multi_axpby_a
!!$ procedure, pass(y) :: mlt_v => s_cuda_multi_mlt_v
!!$ procedure, pass(y) :: mlt_a => s_cuda_multi_mlt_a
!!$ procedure, pass(z) :: mlt_a_2 => s_cuda_multi_mlt_a_2
!!$ procedure, pass(z) :: mlt_v_2 => s_cuda_multi_mlt_v_2
!!$ procedure, pass(x) :: scal => s_cuda_multi_scal
!!$ procedure, pass(x) :: nrm2 => s_cuda_multi_nrm2
!!$ procedure, pass(x) :: amax => s_cuda_multi_amax
!!$ procedure, pass(x) :: asum => s_cuda_multi_asum
procedure, pass(x) :: all => s_cuda_multi_all
procedure, pass(x) :: zero => s_cuda_multi_zero
procedure, pass(x) :: asb => s_cuda_multi_asb
procedure, pass(x) :: sync => s_cuda_multi_sync
procedure, pass(x) :: sync_space => s_cuda_multi_sync_space
procedure, pass(x) :: bld_x => s_cuda_multi_bld_x
procedure, pass(x) :: bld_n => s_cuda_multi_bld_n
procedure, pass(x) :: free => s_cuda_multi_free
procedure, pass(x) :: ins => s_cuda_multi_ins
procedure, pass(x) :: is_host => s_cuda_multi_is_host
procedure, pass(x) :: is_dev => s_cuda_multi_is_dev
procedure, pass(x) :: is_sync => s_cuda_multi_is_sync
procedure, pass(x) :: set_host => s_cuda_multi_set_host
procedure, pass(x) :: set_dev => s_cuda_multi_set_dev
procedure, pass(x) :: set_sync => s_cuda_multi_set_sync
procedure, pass(x) :: set_scal => s_cuda_multi_set_scal
procedure, pass(x) :: set_vect => s_cuda_multi_set_vect
!!$ procedure, pass(x) :: gthzv_x => s_cuda_multi_gthzv_x
!!$ procedure, pass(y) :: sctb => s_cuda_multi_sctb
!!$ procedure, pass(y) :: sctb_x => s_cuda_multi_sctb_x
final :: s_cuda_multi_vect_finalize
end type psb_s_multivect_cuda
public :: psb_s_multivect_cuda
private :: constructor
interface psb_s_multivect_cuda
module procedure constructor
end interface
contains
function constructor(x) result(this)
real(psb_spk_) :: x(:,:)
type(psb_s_multivect_cuda) :: this
integer(psb_ipk_) :: info
this%v = x
call this%asb(size(x,1),size(x,2),info)
end function constructor
!!$ subroutine s_cuda_multi_gthzv_x(i,n,idx,x,y)
!!$ use psi_serial_mod
!!$ integer(psb_ipk_) :: i,n
!!$ class(psb_i_base_multivect_type) :: idx
!!$ real(psb_spk_) :: y(:)
!!$ class(psb_s_multivect_cuda) :: x
!!$
!!$ select type(ii=> idx)
!!$ class is (psb_i_vect_cuda)
!!$ if (ii%is_host()) call ii%sync()
!!$ if (x%is_host()) call x%sync()
!!$
!!$ if (allocated(x%buffer)) then
!!$ if (size(x%buffer) < n) then
!!$ call inner_unregister(x%buffer)
!!$ deallocate(x%buffer, stat=info)
!!$ end if
!!$ end if
!!$
!!$ if (.not.allocated(x%buffer)) then
!!$ allocate(x%buffer(n),stat=info)
!!$ if (info == 0) info = inner_register(x%buffer,x%dt_buf)
!!$ endif
!!$ info = igathMultiVecDeviceDouble(x%deviceVect,&
!!$ & 0, i, n, ii%deviceVect, x%dt_buf, 1)
!!$ call psb_cudaSync()
!!$ y(1:n) = x%buffer(1:n)
!!$
!!$ class default
!!$ call x%gth(n,ii%v(i:),y)
!!$ end select
!!$
!!$
!!$ end subroutine s_cuda_multi_gthzv_x
!!$
!!$
!!$
!!$ subroutine s_cuda_multi_sctb(n,idx,x,beta,y)
!!$ implicit none
!!$ !use psb_const_mod
!!$ integer(psb_ipk_) :: n, idx(:)
!!$ real(psb_spk_) :: beta, x(:)
!!$ class(psb_s_multivect_cuda) :: y
!!$ integer(psb_ipk_) :: info
!!$
!!$ if (n == 0) return
!!$
!!$ if (y%is_dev()) call y%sync()
!!$
!!$ call y%psb_s_base_multivect_type%sctb(n,idx,x,beta)
!!$ call y%set_host()
!!$
!!$ end subroutine s_cuda_multi_sctb
!!$
!!$ subroutine s_cuda_multi_sctb_x(i,n,idx,x,beta,y)
!!$ use psi_serial_mod
!!$ integer(psb_ipk_) :: i, n
!!$ class(psb_i_base_multivect_type) :: idx
!!$ real(psb_spk_) :: beta, x(:)
!!$ class(psb_s_multivect_cuda) :: y
!!$
!!$ select type(ii=> idx)
!!$ class is (psb_i_vect_cuda)
!!$ if (ii%is_host()) call ii%sync()
!!$ if (y%is_host()) call y%sync()
!!$
!!$ if (allocated(y%buffer)) then
!!$ if (size(y%buffer) < n) then
!!$ call inner_unregister(y%buffer)
!!$ deallocate(y%buffer, stat=info)
!!$ end if
!!$ end if
!!$
!!$ if (.not.allocated(y%buffer)) then
!!$ allocate(y%buffer(n),stat=info)
!!$ if (info == 0) info = inner_register(y%buffer,y%dt_buf)
!!$ endif
!!$ y%buffer(1:n) = x(1:n)
!!$ info = iscatMultiVecDeviceDouble(y%deviceVect,&
!!$ & 0, i, n, ii%deviceVect, y%dt_buf, 1,beta)
!!$
!!$ call y%set_dev()
!!$ call psb_cudaSync()
!!$
!!$ class default
!!$ call y%sct(n,ii%v(i:),x,beta)
!!$ end select
!!$
!!$ end subroutine s_cuda_multi_sctb_x
subroutine s_cuda_multi_bld_x(x,this)
use psb_base_mod
real(psb_spk_), intent(in) :: this(:,:)
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_) :: info, m, n
m=size(this,1)
n=size(this,2)
x%m_nrows = m
x%m_ncols = n
call psb_realloc(m,n,x%v,info)
if (info /= 0) then
info=psb_err_alloc_request_
call psb_errpush(info,'s_cuda_multi_bld_x',&
& i_err=(/size(this,1),size(this,2),izero,izero,izero,izero/))
end if
x%v(1:m,1:n) = this(1:m,1:n)
call x%set_host()
call x%sync()
end subroutine s_cuda_multi_bld_x
subroutine s_cuda_multi_bld_n(x,m,n)
integer(psb_ipk_), intent(in) :: m,n
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_) :: info
call x%all(m,n,info)
if (info /= 0) then
call psb_errpush(info,'s_cuda_multi_bld_n',i_err=(/m,n,n,n,n/))
end if
end subroutine s_cuda_multi_bld_n
subroutine s_cuda_multi_set_host(x)
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
x%state = is_host
end subroutine s_cuda_multi_set_host
subroutine s_cuda_multi_set_dev(x)
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
x%state = is_dev
end subroutine s_cuda_multi_set_dev
subroutine s_cuda_multi_set_sync(x)
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
x%state = is_sync
end subroutine s_cuda_multi_set_sync
function s_cuda_multi_is_dev(x) result(res)
implicit none
class(psb_s_multivect_cuda), intent(in) :: x
logical :: res
res = (x%state == is_dev)
end function s_cuda_multi_is_dev
function s_cuda_multi_is_host(x) result(res)
implicit none
class(psb_s_multivect_cuda), intent(in) :: x
logical :: res
res = (x%state == is_host)
end function s_cuda_multi_is_host
function s_cuda_multi_is_sync(x) result(res)
implicit none
class(psb_s_multivect_cuda), intent(in) :: x
logical :: res
res = (x%state == is_sync)
end function s_cuda_multi_is_sync
function s_cuda_multi_get_nrows(x) result(res)
implicit none
class(psb_s_multivect_cuda), intent(in) :: x
integer(psb_ipk_) :: res
res = x%m_nrows
end function s_cuda_multi_get_nrows
function s_cuda_multi_get_ncols(x) result(res)
implicit none
class(psb_s_multivect_cuda), intent(in) :: x
integer(psb_ipk_) :: res
res = x%m_ncols
end function s_cuda_multi_get_ncols
function s_cuda_multi_get_fmt() result(res)
implicit none
character(len=5) :: res
res = 'sGPU'
end function s_cuda_multi_get_fmt
!!$ function s_cuda_multi_dot_v(n,x,y) result(res)
!!$ implicit none
!!$ class(psb_s_multivect_cuda), intent(inout) :: x
!!$ class(psb_s_base_multivect_type), intent(inout) :: y
!!$ integer(psb_ipk_), intent(in) :: n
!!$ real(psb_spk_) :: res
!!$ real(psb_spk_), external :: ddot
!!$ integer(psb_ipk_) :: info
!!$
!!$ res = szero
!!$ !
!!$ ! Note: this is the gpu implementation.
!!$ ! When we get here, we are sure that X is of
!!$ ! TYPE psb_s_vect
!!$ !
!!$ select type(yy => y)
!!$ type is (psb_s_base_multivect_type)
!!$ if (x%is_dev()) call x%sync()
!!$ res = ddot(n,x%v,1,yy%v,1)
!!$ type is (psb_s_multivect_cuda)
!!$ if (x%is_host()) call x%sync()
!!$ if (yy%is_host()) call yy%sync()
!!$ info = dotMultiVecDevice(res,n,x%deviceVect,yy%deviceVect)
!!$ if (info /= 0) then
!!$ info = psb_err_internal_error_
!!$ call psb_errpush(info,'s_cuda_multi_dot_v')
!!$ end if
!!$
!!$ class default
!!$ ! y%sync is done in dot_a
!!$ call x%sync()
!!$ res = y%dot(n,x%v)
!!$ end select
!!$
!!$ end function s_cuda_multi_dot_v
!!$
!!$ function s_cuda_multi_dot_a(n,x,y) result(res)
!!$ implicit none
!!$ class(psb_s_multivect_cuda), intent(inout) :: x
!!$ real(psb_spk_), intent(in) :: y(:)
!!$ integer(psb_ipk_), intent(in) :: n
!!$ real(psb_spk_) :: res
!!$ real(psb_spk_), external :: ddot
!!$
!!$ if (x%is_dev()) call x%sync()
!!$ res = ddot(n,y,1,x%v,1)
!!$
!!$ end function s_cuda_multi_dot_a
!!$
!!$ subroutine s_cuda_multi_axpby_v(m,alpha, x, beta, y, info)
!!$ use psi_serial_mod
!!$ implicit none
!!$ integer(psb_ipk_), intent(in) :: m
!!$ class(psb_s_base_multivect_type), intent(inout) :: x
!!$ class(psb_s_multivect_cuda), intent(inout) :: y
!!$ real(psb_spk_), intent (in) :: alpha, beta
!!$ integer(psb_ipk_), intent(out) :: info
!!$ integer(psb_ipk_) :: nx, ny
!!$
!!$ info = psb_success_
!!$
!!$ select type(xx => x)
!!$ type is (psb_s_base_multivect_type)
!!$ if ((beta /= szero).and.(y%is_dev()))&
!!$ & call y%sync()
!!$ call psb_geaxpby(m,alpha,xx%v,beta,y%v,info)
!!$ call y%set_host()
!!$ type is (psb_s_multivect_cuda)
!!$ ! Do something different here
!!$ if ((beta /= szero).and.y%is_host())&
!!$ & call y%sync()
!!$ if (xx%is_host()) call xx%sync()
!!$ nx = getMultiVecDeviceSize(xx%deviceVect)
!!$ ny = getMultiVecDeviceSize(y%deviceVect)
!!$ if ((nx<m).or.(ny<m)) then
!!$ info = psb_err_internal_error_
!!$ info = psb_err_internal_error_
!!$ else
!!$ info = axpbyMultiVecDevice(m,alpha,xx%deviceVect,beta,y%deviceVect)
!!$ end if
!!$ call y%set_dev()
!!$ class default
!!$ call x%sync()
!!$ call y%axpby(m,alpha,x%v,beta,info)
!!$ end select
!!$
!!$ end subroutine s_cuda_multi_axpby_v
!!$
!!$ subroutine s_cuda_multi_axpby_a(m,alpha, x, beta, y, info)
!!$ use psi_serial_mod
!!$ implicit none
!!$ integer(psb_ipk_), intent(in) :: m
!!$ real(psb_spk_), intent(in) :: x(:)
!!$ class(psb_s_multivect_cuda), intent(inout) :: y
!!$ real(psb_spk_), intent (in) :: alpha, beta
!!$ integer(psb_ipk_), intent(out) :: info
!!$
!!$ if (y%is_dev()) call y%sync()
!!$ call psb_geaxpby(m,alpha,x,beta,y%v,info)
!!$ call y%set_host()
!!$ end subroutine s_cuda_multi_axpby_a
!!$
!!$ subroutine s_cuda_multi_mlt_v(x, y, info)
!!$ use psi_serial_mod
!!$ implicit none
!!$ class(psb_s_base_multivect_type), intent(inout) :: x
!!$ class(psb_s_multivect_cuda), intent(inout) :: y
!!$ integer(psb_ipk_), intent(out) :: info
!!$
!!$ integer(psb_ipk_) :: i, n
!!$
!!$ info = 0
!!$ n = min(x%get_nrows(),y%get_nrows())
!!$ select type(xx => x)
!!$ type is (psb_s_base_multivect_type)
!!$ if (y%is_dev()) call y%sync()
!!$ do i=1, n
!!$ y%v(i) = y%v(i) * xx%v(i)
!!$ end do
!!$ call y%set_host()
!!$ type is (psb_s_multivect_cuda)
!!$ ! Do something different here
!!$ if (y%is_host()) call y%sync()
!!$ if (xx%is_host()) call xx%sync()
!!$ info = axyMultiVecDevice(n,done,xx%deviceVect,y%deviceVect)
!!$ call y%set_dev()
!!$ class default
!!$ call xx%sync()
!!$ call y%mlt(xx%v,info)
!!$ call y%set_host()
!!$ end select
!!$
!!$ end subroutine s_cuda_multi_mlt_v
!!$
!!$ subroutine s_cuda_multi_mlt_a(x, y, info)
!!$ use psi_serial_mod
!!$ implicit none
!!$ real(psb_spk_), intent(in) :: x(:)
!!$ class(psb_s_multivect_cuda), intent(inout) :: y
!!$ integer(psb_ipk_), intent(out) :: info
!!$ integer(psb_ipk_) :: i, n
!!$
!!$ info = 0
!!$ call y%sync()
!!$ call y%psb_s_base_multivect_type%mlt(x,info)
!!$ call y%set_host()
!!$ end subroutine s_cuda_multi_mlt_a
!!$
!!$ subroutine s_cuda_multi_mlt_a_2(alpha,x,y,beta,z,info)
!!$ use psi_serial_mod
!!$ implicit none
!!$ real(psb_spk_), intent(in) :: alpha,beta
!!$ real(psb_spk_), intent(in) :: x(:)
!!$ real(psb_spk_), intent(in) :: y(:)
!!$ class(psb_s_multivect_cuda), intent(inout) :: z
!!$ integer(psb_ipk_), intent(out) :: info
!!$ integer(psb_ipk_) :: i, n
!!$
!!$ info = 0
!!$ if (z%is_dev()) call z%sync()
!!$ call z%psb_s_base_multivect_type%mlt(alpha,x,y,beta,info)
!!$ call z%set_host()
!!$ end subroutine s_cuda_multi_mlt_a_2
!!$
!!$ subroutine s_cuda_multi_mlt_v_2(alpha,x,y, beta,z,info,conjgx,conjgy)
!!$ use psi_serial_mod
!!$ use psb_string_mod
!!$ implicit none
!!$ real(psb_spk_), intent(in) :: alpha,beta
!!$ class(psb_s_base_multivect_type), intent(inout) :: x
!!$ class(psb_s_base_multivect_type), intent(inout) :: y
!!$ class(psb_s_multivect_cuda), intent(inout) :: z
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character(len=1), intent(in), optional :: conjgx, conjgy
!!$ integer(psb_ipk_) :: i, n
!!$ logical :: conjgx_, conjgy_
!!$
!!$ if (.false.) then
!!$ ! These are present just for coherence with the
!!$ ! complex versions; they do nothing here.
!!$ conjgx_=.false.
!!$ if (present(conjgx)) conjgx_ = (psb_toupper(conjgx)=='C')
!!$ conjgy_=.false.
!!$ if (present(conjgy)) conjgy_ = (psb_toupper(conjgy)=='C')
!!$ end if
!!$
!!$ n = min(x%get_nrows(),y%get_nrows(),z%get_nrows())
!!$
!!$ !
!!$ ! Need to reconsider BETA in the GPU side
!!$ ! of things.
!!$ !
!!$ info = 0
!!$ select type(xx => x)
!!$ type is (psb_s_multivect_cuda)
!!$ select type (yy => y)
!!$ type is (psb_s_multivect_cuda)
!!$ if (xx%is_host()) call xx%sync()
!!$ if (yy%is_host()) call yy%sync()
!!$ ! Z state is irrelevant: it will be done on the GPU.
!!$ info = axybzMultiVecDevice(n,alpha,xx%deviceVect,&
!!$ & yy%deviceVect,beta,z%deviceVect)
!!$ call z%set_dev()
!!$ class default
!!$ call xx%sync()
!!$ call yy%sync()
!!$ call z%psb_s_base_multivect_type%mlt(alpha,xx,yy,beta,info)
!!$ call z%set_host()
!!$ end select
!!$
!!$ class default
!!$ call x%sync()
!!$ call y%sync()
!!$ call z%psb_s_base_multivect_type%mlt(alpha,x,y,beta,info)
!!$ call z%set_host()
!!$ end select
!!$ end subroutine s_cuda_multi_mlt_v_2
subroutine s_cuda_multi_set_scal(x,val)
class(psb_s_multivect_cuda), intent(inout) :: x
real(psb_spk_), intent(in) :: val
integer(psb_ipk_) :: info
if (x%is_dev()) call x%sync()
call x%psb_s_base_multivect_type%set_scal(val)
call x%set_host()
end subroutine s_cuda_multi_set_scal
subroutine s_cuda_multi_set_vect(x,val)
class(psb_s_multivect_cuda), intent(inout) :: x
real(psb_spk_), intent(in) :: val(:,:)
integer(psb_ipk_) :: nr
integer(psb_ipk_) :: info
if (x%is_dev()) call x%sync()
call x%psb_s_base_multivect_type%set_vect(val)
call x%set_host()
end subroutine s_cuda_multi_set_vect
!!$ subroutine s_cuda_multi_scal(alpha, x)
!!$ implicit none
!!$ class(psb_s_multivect_cuda), intent(inout) :: x
!!$ real(psb_spk_), intent (in) :: alpha
!!$
!!$ if (x%is_dev()) call x%sync()
!!$ call x%psb_s_base_multivect_type%scal(alpha)
!!$ call x%set_host()
!!$ end subroutine s_cuda_multi_scal
!!$
!!$
!!$ function s_cuda_multi_nrm2(n,x) result(res)
!!$ implicit none
!!$ class(psb_s_multivect_cuda), intent(inout) :: x
!!$ integer(psb_ipk_), intent(in) :: n
!!$ real(psb_spk_) :: res
!!$ integer(psb_ipk_) :: info
!!$ ! WARNING: this should be changed.
!!$ if (x%is_host()) call x%sync()
!!$ info = nrm2MultiVecDevice(res,n,x%deviceVect)
!!$
!!$ end function s_cuda_multi_nrm2
!!$
!!$ function s_cuda_multi_amax(n,x) result(res)
!!$ implicit none
!!$ class(psb_s_multivect_cuda), intent(inout) :: x
!!$ integer(psb_ipk_), intent(in) :: n
!!$ real(psb_spk_) :: res
!!$
!!$ if (x%is_dev()) call x%sync()
!!$ res = maxval(abs(x%v(1:n)))
!!$
!!$ end function s_cuda_multi_amax
!!$
!!$ function s_cuda_multi_asum(n,x) result(res)
!!$ implicit none
!!$ class(psb_s_multivect_cuda), intent(inout) :: x
!!$ integer(psb_ipk_), intent(in) :: n
!!$ real(psb_spk_) :: res
!!$
!!$ if (x%is_dev()) call x%sync()
!!$ res = sum(abs(x%v(1:n)))
!!$
!!$ end function s_cuda_multi_asum
subroutine s_cuda_multi_all(m,n, x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
integer(psb_ipk_), intent(in) :: m,n
class(psb_s_multivect_cuda), intent(out) :: x
integer(psb_ipk_), intent(out) :: info
call psb_realloc(m,n,x%v,info,pad=szero)
x%m_nrows = m
x%m_ncols = n
if (info == 0) call x%set_host()
if (info == 0) call x%sync_space(info)
if (info /= 0) then
info=psb_err_alloc_request_
call psb_errpush(info,'s_cuda_multi_all',&
& i_err=(/m,n,n,n,n/))
end if
end subroutine s_cuda_multi_all
subroutine s_cuda_multi_zero(x)
use psi_serial_mod
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
if (allocated(x%v)) x%v=szero
call x%set_host()
end subroutine s_cuda_multi_zero
subroutine s_cuda_multi_asb(m,n, x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
integer(psb_ipk_), intent(in) :: m,n
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nd, nc
x%m_nrows = m
x%m_ncols = n
if (x%is_host()) then
call x%psb_s_base_multivect_type%asb(m,n,info)
if (info == psb_success_) call x%sync_space(info)
else if (x%is_dev()) then
nd = getMultiVecDevicePitch(x%deviceVect)
nc = getMultiVecDeviceCount(x%deviceVect)
if ((nd < m).or.(nc<n)) then
call x%sync()
call x%psb_s_base_multivect_type%asb(m,n,info)
if (info == psb_success_) call x%sync_space(info)
call x%set_host()
end if
end if
end subroutine s_cuda_multi_asb
subroutine s_cuda_multi_sync_space(x,info)
use psb_realloc_mod
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: mh,nh,md,nd
info = 0
if (x%is_host()) then
if (allocated(x%v)) then
mh = size(x%v,1)
nh = size(x%v,2)
else
mh=0
nh=0
end if
if (c_associated(x%deviceVect)) then
md = getMultiVecDevicePitch(x%deviceVect)
nd = getMultiVecDeviceCount(x%deviceVect)
if ((md < mh).or.(nd<nh)) then
call freeMultiVecDevice(x%deviceVect)
x%deviceVect=c_null_ptr
end if
end if
if (.not.c_associated(x%deviceVect)) then
info = FallocMultiVecDevice(x%deviceVect,nh,mh,spgpu_type_float)
if (info == 0) &
& call psb_realloc(getMultiVecDevicePitch(x%deviceVect),&
& getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=szero)
if (info /= 0) then
!!$ write(0,*) 'Error from FallocMultiVecDevice',info,n
if (info == spgpu_outofmem) then
info = psb_err_alloc_request_
end if
end if
end if
else if (x%is_dev()) then
!
if (allocated(x%v)) then
mh = size(x%v,1)
nh = size(x%v,2)
else
mh=0
nh=0
end if
md = getMultiVecDevicePitch(x%deviceVect)
nd = getMultiVecDeviceCount(x%deviceVect)
if ((mh /= md).or.(nh /= nd)) then
call psb_realloc(getMultiVecDevicePitch(x%deviceVect),&
& getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=szero)
end if
end if
end subroutine s_cuda_multi_sync_space
subroutine s_cuda_multi_sync(x)
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_) :: n,info
info = 0
if (x%is_host()) then
if (.not.c_associated(x%deviceVect)) then
call x%sync_space(info)
end if
if (info == 0) &
& info = writeMultiVecDevice(x%deviceVect,x%v,size(x%v,1))
else if (x%is_dev()) then
info = readMultiVecDevice(x%deviceVect,x%v,size(x%v,1))
end if
if (info == 0) call x%set_sync()
if (info /= 0) then
info=psb_err_internal_error_
call psb_errpush(info,'s_cuda_multi_sync')
end if
end subroutine s_cuda_multi_sync
subroutine s_cuda_multi_free(x, info)
use psi_serial_mod
use psb_realloc_mod
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(out) :: info
info = 0
if (c_associated(x%deviceVect)) then
call freeMultiVecDevice(x%deviceVect)
x%deviceVect=c_null_ptr
end if
if (allocated(x%buffer)) then
!!$ call inner_unregister(x%buffer)
deallocate(x%buffer, stat=info)
end if
if (allocated(x%v)) deallocate(x%v, stat=info)
call x%set_sync()
end subroutine s_cuda_multi_free
subroutine s_cuda_multi_vect_finalize(x)
use psi_serial_mod
use psb_realloc_mod
implicit none
type(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_) :: info
info = 0
if (c_associated(x%deviceVect)) then
call freeMultiVecDevice(x%deviceVect)
x%deviceVect=c_null_ptr
end if
if (allocated(x%buffer)) then
!!$ call inner_unregister(x%buffer)
deallocate(x%buffer, stat=info)
end if
if (allocated(x%v)) deallocate(x%v, stat=info)
call x%set_sync()
end subroutine s_cuda_multi_vect_finalize
subroutine s_cuda_multi_ins(n,irl,val,dupl,x,info)
use psi_serial_mod
implicit none
class(psb_s_multivect_cuda), intent(inout) :: x
integer(psb_ipk_), intent(in) :: n, dupl
integer(psb_ipk_), intent(in) :: irl(:)
real(psb_spk_), intent(in) :: val(:,:)
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: i
info = 0
if (x%is_dev()) call x%sync()
call x%psb_s_base_multivect_type%ins(n,irl,val,dupl,info)
call x%set_host()
end subroutine s_cuda_multi_ins
end module psb_s_cuda_multivect_mod