Multivector Product GPU

psblas-bgmres
gabrielequatrana 4 months ago
parent 34a2a7ddbc
commit f7d44a70d6

@ -98,37 +98,23 @@ module psb_d_psblas_mod
end interface
interface psb_geprod
function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
subroutine psb_dprod_multivect(x,y,res,desc_a,info,global)
import :: psb_desc_type, psb_dpk_, psb_ipk_, &
& psb_d_multivect_type, psb_dspmat_type
real(psb_dpk_), allocatable :: res(:,:)
type(psb_d_multivect_type), intent(inout) :: x, y
type(psb_d_multivect_type), intent(inout) :: x, y, res
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_), intent(out) :: info
logical, intent(in), optional :: trans
logical, intent(in), optional :: global
end function psb_dprod_multivect
function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
end subroutine psb_dprod_multivect
subroutine psb_dprod_multivect_a(x,y,res,desc_a,info,global)
import :: psb_desc_type, psb_dpk_, psb_ipk_, &
& psb_d_multivect_type, psb_dspmat_type
real(psb_dpk_), allocatable :: res(:,:)
type(psb_d_multivect_type), intent(inout) :: x
type(psb_d_multivect_type), intent(inout) :: x, res
real(psb_dpk_), intent(in) :: y(:,:)
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_), intent(out) :: info
logical, intent(in), optional :: trans
logical, intent(in), optional :: global
end function psb_dprod_multivect_a
function psb_dprod_m(x,y,desc_a,info,trans,global) result(res)
import :: psb_desc_type, psb_dpk_, psb_ipk_, &
& psb_d_multivect_type, psb_dspmat_type
real(psb_dpk_), allocatable :: res(:,:)
real(psb_dpk_), intent(in) :: x(:,:), y(:,:)
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_), intent(out) :: info
logical, intent(in), optional :: trans
logical, intent(in), optional :: global
end function psb_dprod_m
end subroutine psb_dprod_multivect_a
end interface
interface psb_geaxpby

@ -2593,8 +2593,12 @@ contains
implicit none
class(psb_d_base_multivect_type), intent(inout) :: x
if (allocated(x%v)) x%v=dzero
if (allocated(x%v)) then
!$omp workshare
x%v(:,:)=dzero
!$omp end workshare
end if
call x%set_host()
end subroutine d_base_mlv_zero
@ -2839,13 +2843,37 @@ contains
!! \brief Set all entries
!! \param val The value to set
!!
subroutine d_base_mlv_set_scal(x,val)
subroutine d_base_mlv_set_scal(x,val,first_row,last_row,first_col,last_col)
implicit none
class(psb_d_base_multivect_type), intent(inout) :: x
real(psb_dpk_), intent(in) :: val
integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
integer(psb_ipk_) :: first_row_, last_row_, first_col_, last_col_, i, j
integer(psb_ipk_) :: info
x%v = val
first_row_=1
last_row_=size(x%v,1)
first_col_=1
last_col_=size(x%v,2)
if (present(first_row)) first_row_ = max(first_row,first_row_)
if (present(last_row)) last_row_ = min(last_row,last_row_)
if (present(first_col)) first_col_ = max(first_col,first_col_)
if (present(last_col)) last_col_ = min(last_col,last_col_)
if (x%is_dev()) call x%sync()
#if defined(OPENMP)
!$omp parallel do private (i,j)
do i = first_row_, last_row_
do j = first_col_, last_col_
x%v(i,j) = val
end do
end do
#else
x%v(first_row_:last_row_,first_col_:last_col_) = val
#endif
call x%set_host()
end subroutine d_base_mlv_set_scal
@ -2855,22 +2883,42 @@ contains
!! \brief Set all entries
!! \param val(:) The vector to be copied in
!!
subroutine d_base_mlv_set_vect(x,val)
subroutine d_base_mlv_set_vect(x,val,first_row,last_row,first_col,last_col)
implicit none
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: x
real(psb_dpk_), intent(in) :: val(:,:)
integer(psb_ipk_) :: nr, nc
integer(psb_ipk_) :: info
if (allocated(x%v)) then
nr = min(size(x%v,1),size(val,1))
nc = min(size(x%v,2),size(val,2))
integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
integer(psb_ipk_) :: first_row_, last_row_, first_col_, last_col_, i, j, info
x%v(1:nr,1:nc) = val(1:nr,1:nc)
else
x%v = val
if (.not.allocated(x%v)) then
call psb_realloc(size(val,1),size(val,2),x%v,info)
end if
first_row_=1
last_row_=size(x%v,1)
first_col_=1
last_col_=size(x%v,2)
if (present(first_row)) first_row_ = max(first_row,first_row_)
if (present(last_row)) last_row_ = min(last_row,last_row_)
if (present(first_col)) first_col_ = max(first_col,first_col_)
if (present(last_col)) last_col_ = min(last_col,last_col_)
if (x%is_dev()) call x%sync()
#if defined(OPENMP)
!$omp parallel do private(i,j)
do i = first_row_, last_row_
do j = first_col_, last_col_
x%v(i,j) = val(i-first_row_+1,j-first_col_+1)
end do
end do
#else
x%v(first_row_:last_row_,first_col_:last_col_) = val(1:last_row_-first_row_+1,1:last_col_-first_col_+1)
#endif
call x%set_host()
end subroutine d_base_mlv_set_vect
!
@ -2882,18 +2930,18 @@ contains
!! \brief Product by a mlv
!! \param nr Number of rows to be considered
!! \param y The other (base_mlv_vect) to be multiplied by
!! \param trans If true, x is transposed
!! \param res Result vector
!!
function d_base_mlv_prod_v(nr,x,y,trans) result(res)
subroutine d_base_mlv_prod_v(nr,x,y,res)
implicit none
class(psb_d_base_multivect_type), intent(inout) :: x, y
class(psb_d_base_multivect_type), intent(inout) :: x, y, res
integer(psb_ipk_), intent(in) :: nr
logical, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: res(:,:)
external :: dgemm
integer(psb_ipk_) :: x_n, y_n, lda, ldb
write(*,*) 'CPU PROD'
if (x%is_dev()) call x%sync()
if (y%is_dev()) call y%sync()
select type(yy => y)
@ -2901,22 +2949,12 @@ contains
x_n = x%get_ncols()
y_n = y%get_ncols()
lda = x%get_nrows()
if (trans) then
allocate(res(x_n,y_n))
res = dzero
ldb = y%get_nrows()
call dgemm('T','N',x_n,y_n,nr,done,x%v,lda,y%v,ldb,dzero,res,x_n)
else
allocate(res(x%get_nrows(),y_n))
res = dzero
ldb = y_n
call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y%v,ldb,dzero,res,lda)
end if
ldb = y_n
call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y%v,ldb,dzero,res%v,lda)
class default
res = x%prod(nr,y%v,trans)
call x%prod(nr,y%v,res)
end select
end function d_base_mlv_prod_v
end subroutine d_base_mlv_prod_v
!
! Multivectors product base
@ -2930,33 +2968,24 @@ contains
!! \param trans If true, x is transposed
!! \param res Result vector
!!
function d_base_mlv_prod_a(nr,x,y,trans) result(res)
subroutine d_base_mlv_prod_a(nr,x,y,res)
implicit none
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: x, res
real(psb_dpk_), intent(in) :: y(:,:)
integer(psb_ipk_), intent(in) :: nr
logical, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: res(:,:)
external :: dgemm
integer(psb_ipk_) :: x_n, y_n, lda, ldb
write(*,*) 'CPU PROD'
if (x%is_dev()) call x%sync()
x_n = x%get_ncols()
y_n = size(y,dim=2)
lda = x%get_nrows()
if (trans) then
allocate(res(x_n,y_n))
res = dzero
ldb = size(y,dim=1)
call dgemm('T','N',x_n,y_n,nr,done,x%v,lda,y,ldb,dzero,res,x_n)
else
allocate(res(x%get_nrows(),y_n))
res = dzero
ldb = x_n
call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y,ldb,dzero,res,lda)
end if
end function d_base_mlv_prod_a
ldb = x_n
call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y,ldb,dzero,res%v,lda)
end subroutine d_base_mlv_prod_a
!
! Dot products

@ -1627,21 +1627,23 @@ contains
end if
end function d_vect_get_vect
subroutine d_vect_set_scal(x,val)
subroutine d_vect_set_scal(x,val,first_row,last_row,first_col,last_col)
class(psb_d_multivect_type), intent(inout) :: x
real(psb_dpk_), intent(in) :: val
integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
integer(psb_ipk_) :: info
if (allocated(x%v)) call x%v%set(val)
if (allocated(x%v)) call x%v%set(val,first_row,last_row,first_col,last_col)
end subroutine d_vect_set_scal
subroutine d_vect_set_vect(x,val)
subroutine d_vect_set_vect(x,val,first_row,last_row,first_col,last_col)
class(psb_d_multivect_type), intent(inout) :: x
real(psb_dpk_), intent(in) :: val(:,:)
integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
integer(psb_ipk_) :: info
if (allocated(x%v)) call x%v%set(val)
if (allocated(x%v)) call x%v%set(val,first_row,last_row,first_col,last_col)
end subroutine d_vect_set_vect
@ -1893,30 +1895,26 @@ contains
call move_alloc(tmp,x%v)
end subroutine d_vect_cnv
function d_vect_prod_v(nr,x,y,trans) result(res)
subroutine d_vect_prod_v(nr,x,y,res)
implicit none
class(psb_d_multivect_type), intent(inout) :: x, y
class(psb_d_multivect_type), intent(inout) :: x, y, res
integer(psb_ipk_), intent(in) :: nr
logical, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: res(:,:)
if (allocated(x%v).and.allocated(y%v)) &
& res = x%v%prod(nr,y%v,trans)
& call x%v%prod(nr,y%v,res%v)
end function d_vect_prod_v
end subroutine d_vect_prod_v
function d_vect_prod_a(nr,x,y,trans) result(res)
subroutine d_vect_prod_a(nr,x,y,res)
implicit none
class(psb_d_multivect_type), intent(inout) :: x
class(psb_d_multivect_type), intent(inout) :: x, res
real(psb_dpk_), intent(in) :: y(:,:)
integer(psb_ipk_), intent(in) :: nr
logical, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: res(:,:)
if (allocated(x%v)) &
& res = x%v%prod(nr,y,trans)
& call x%v%prod(nr,y,res%v)
end function d_vect_prod_a
end subroutine d_vect_prod_a
function d_vect_dot_v(nr,x,y) result(res)
implicit none

@ -43,14 +43,13 @@
! y - type(psb_d_multivect_type) The input vector containing the entries of sub( Y ).
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! trans - logical(optional) Whether multivector X is transposed, default: .false.
! global - logical(optional) Whether to perform the global reduce, default: .true.
!
! Note: from a functional point of view, X and Y are input, but here
! they are declared INOUT because of the sync() methods.
!
!
function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
subroutine psb_dprod_multivect(x,y,res,desc_a,info,global)
use psb_desc_mod
use psb_d_base_mat_mod
use psb_check_mod
@ -59,11 +58,9 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
use psb_d_vect_mod
use psb_d_psblas_mod, psb_protect_name => psb_dprod_multivect
implicit none
real(psb_dpk_), allocatable :: res(:,:)
type(psb_d_multivect_type), intent(inout) :: x, y
type(psb_d_multivect_type), intent(inout) :: x, y, res
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_), intent(out) :: info
logical, intent(in), optional :: trans
logical, intent(in), optional :: global
! locals
@ -71,7 +68,7 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
integer(psb_ipk_) :: np, me, idx, ndm,&
& err_act, iix, jjx, iiy, jjy, i, j, nr
integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n
logical :: global_, trans_
logical :: global_
character(len=20) :: name, ch_err
name='psb_dprod_multivect'
@ -99,12 +96,6 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
goto 9999
endif
if (present(trans)) then
trans_ = trans
else
trans_ = .false.
end if
if (present(global)) then
global_ = global
else
@ -140,25 +131,26 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
nr = desc_a%get_local_rows()
if (nr > 0) then
res = x%prod(nr,y,trans_)
call x%prod(nr,y,res)
! adjust dot_local because overlapped elements are computed more than once
if (size(desc_a%ovrlap_elem,1)>0) then
if (x%v%is_dev()) call x%sync()
if (y%v%is_dev()) call y%sync()
if (res%v%is_dev()) call res%sync()
do j=1,x%get_ncols()
do i=1,size(desc_a%ovrlap_elem,1)
idx = desc_a%ovrlap_elem(i,1)
ndm = desc_a%ovrlap_elem(i,2)
res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y%v%v(idx,:))
res%v%v(j,:) = res%v%v(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y%v%v(idx,:))
end do
end do
end if
else
res = dzero
call res%zero()
end if
! compute global sum
if (global_) call psb_sum(ctxt, res)
if (global_) call psb_sum(ctxt, res%v%v)
call psb_erractionrestore(err_act)
return
@ -167,7 +159,7 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
return
end function psb_dprod_multivect
end subroutine psb_dprod_multivect
!
! Function: psb_dprod_multivect_a
! psb_dprod computes the product of two distributed multivectors,
@ -181,14 +173,13 @@ end function psb_dprod_multivect
! y - real(:,:) The input vector containing the entries of sub( Y ).
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! trans - logical(optional) Whether multivector X is transposed, default: .false.
! global - logical(optional) Whether to perform the global reduce, default: .true.
!
! Note: from a functional point of view, X and Y are input, but here
! they are declared INOUT because of the sync() methods.
!
!
function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
subroutine psb_dprod_multivect_a(x,y,res,desc_a,info,global)
use psb_desc_mod
use psb_d_base_mat_mod
use psb_check_mod
@ -197,12 +188,10 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
use psb_d_vect_mod
use psb_d_psblas_mod, psb_protect_name => psb_dprod_multivect_a
implicit none
real(psb_dpk_), allocatable :: res(:,:)
type(psb_d_multivect_type), intent(inout) :: x
type(psb_d_multivect_type), intent(inout) :: x, res
real(psb_dpk_), intent(in) :: y(:,:)
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_), intent(out) :: info
logical, intent(in), optional :: trans
logical, intent(in), optional :: global
! locals
@ -210,7 +199,7 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
integer(psb_ipk_) :: np, me, idx, ndm,&
& err_act, iix, jjx, iiy, jjy, i, j, nr
integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n
logical :: global_, trans_
logical :: global_
character(len=20) :: name, ch_err
name='psb_dprod_multivect'
@ -233,12 +222,6 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
goto 9999
endif
if (present(trans)) then
trans_ = trans
else
trans_ = .false.
end if
if (present(global)) then
global_ = global
else
@ -268,164 +251,25 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
nr = desc_a%get_local_rows()
if (nr > 0) then
res = x%prod(nr,y,trans_)
call x%prod(nr,y,res)
! adjust dot_local because overlapped elements are computed more than once
if (size(desc_a%ovrlap_elem,1)>0) then
if (x%v%is_dev()) call x%sync()
if (res%v%is_dev()) call res%sync()
do j=1,x%get_ncols()
do i=1,size(desc_a%ovrlap_elem,1)
idx = desc_a%ovrlap_elem(i,1)
ndm = desc_a%ovrlap_elem(i,2)
res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y(idx,:))
res%v%v(j,:) = res%v%v(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y(idx,:))
end do
end do
end if
else
res = dzero
end if
! compute global sum
if (global_) call psb_sum(ctxt, res)
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(ctxt,err_act)
return
end function psb_dprod_multivect_a
!
! Function: psb_dprod_m
! psb_dprod computes the product of two distributed multivectors,
!
! prod := ( X ) * ( Y ) or
! prod := ( X )**C * ( Y )
!
!
! Arguments:
! x - real(:,:) The input vector containing the entries of sub( X ).
! y - real(:,:) The input vector containing the entries of sub( Y ).
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! trans - logical(optional) Whether multivector X is transposed, default: .false.
! global - logical(optional) Whether to perform the global reduce, default: .true.
!
! Note: from a functional point of view, X and Y are input, but here
! they are declared INOUT because of the sync() methods.
!
!
function psb_dprod_m(x,y,desc_a,info,trans,global) result(res)
use psb_desc_mod
use psb_d_base_mat_mod
use psb_check_mod
use psb_error_mod
use psb_penv_mod
use psb_d_vect_mod
use psb_d_psblas_mod, psb_protect_name => psb_dprod_m
implicit none
real(psb_dpk_), allocatable :: res(:,:)
real(psb_dpk_), intent(in) :: x(:,:), y(:,:)
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_), intent(out) :: info
logical, intent(in), optional :: trans
logical, intent(in), optional :: global
! locals
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, idx, ndm,&
& err_act, iix, jjx, iiy, jjy, i, j, nr, x_n, y_n, lda, ldb
integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n
logical :: global_, trans_
character(len=20) :: name, ch_err
name='psb_dprod_multivect'
info=psb_success_
call psb_erractionsave(err_act)
if (psb_errstatus_fatal()) then
info = psb_err_internal_error_ ; goto 9999
end if
ctxt=desc_a%get_context()
call psb_info(ctxt, me, np)
if (np == -ione) then
info = psb_err_context_error_
call psb_errpush(info,name)
goto 9999
endif
if (present(trans)) then
trans_ = trans
else
trans_ = .false.
end if
if (present(global)) then
global_ = global
else
global_ = .false.
end if
ix = ione
ijx = ione
m = desc_a%get_global_rows()
! check vector correctness
n = size(x,2)
call psb_chkvect(m,n,size(x,1),ix,ijx,desc_a,info,iix,jjx)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_chkvect'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
if ((iix /= ione)) then
info=psb_err_ix_n1_iy_n1_unsupported_
call psb_errpush(info,name)
goto 9999
end if
nr = desc_a%get_local_rows()
x_n = size(x,2)
y_n = size(y,2)
lda = size(x,1)
if (nr > 0) then
if (trans_) then
allocate(res(x_n,y_n))
res = dzero
ldb = size(y,1)
call dgemm('T','N',x_n,y_n,nr,done,x,lda,y,ldb,dzero,res,x_n)
else
allocate(res(lda,y_n))
res = dzero
ldb = x_n
call dgemm('N','N',nr,y_n,x_n,done,x,lda,y,ldb,dzero,res,lda)
end if
! adjust dot_local because overlapped elements are computed more than once
if (size(desc_a%ovrlap_elem,1)>0) then
do j=1,x_n
do i=1,size(desc_a%ovrlap_elem,1)
idx = desc_a%ovrlap_elem(i,1)
ndm = desc_a%ovrlap_elem(i,2)
res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x(idx,:)*y(idx,:))
end do
end do
end if
else
if (trans_) then
allocate(res(x_n,y_n))
else
allocate(res(lda,y_n))
end if
res = dzero
call res%zero()
end if
! compute global sum
if (global_) call psb_sum(ctxt, res)
if (global_) call psb_sum(ctxt, res%v%v)
call psb_erractionrestore(err_act)
return
@ -434,4 +278,4 @@ function psb_dprod_m(x,y,desc_a,info,trans,global) result(res)
return
end function psb_dprod_m
end subroutine psb_dprod_multivect_a

@ -89,10 +89,37 @@ int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
return(i);
}
int prodMultiVecDeviceDouble(char transa, int m, int n, int k,
double *alpha, void *deviceX, void* deviceY,
double *beta, void* deviceZ)
{
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
struct MultiVectDevice *z = (struct MultiVectDevice *) deviceZ;
int status;
cublasHandle_t handle=psb_cudaGetCublasHandle();
cublasOperation_t trans=((transa == 'N')? CUBLAS_OP_N:((transa=='T')? CUBLAS_OP_T:CUBLAS_OP_C));
/* Note: the M,N,K choices according to TRANS have already been handled in the caller */
if (n == 1) {
status = cublasDgemv(handle, trans, m, k,
alpha, x->v_, x->pitch_, y->v_, 1,
beta, z->v_, 1);
} else {
status = cublasDgemm(handle, trans, CUBLAS_OP_N, m, n, k,
alpha, x->v_, x->pitch_, y->v_, y->pitch_,
beta, z->v_, z->pitch_);
}
if (status == CUBLAS_STATUS_SUCCESS)
return SPGPU_SUCCESS;
else
return SPGPU_UNSUPPORTED;
}
int setscalMultiVecDeviceDouble(double val, int first, int last,
int indexBase, void* devMultiVecX)
{ int i=0;
int pitch = 0;
struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
spgpuHandle_t handle=psb_cudaGetHandle();
@ -101,6 +128,20 @@ int setscalMultiVecDeviceDouble(double val, int first, int last,
return(i);
}
int setscalMultiVecDeviceDoubleR2(double val, int first_row, int last_row,
int first_col, int last_col, int indexBase, void* devMultiVecX)
{ int i=0;
struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
spgpuHandle_t handle=psb_cudaGetHandle();
int pitch = devVecX->pitch_;
for (int j=first_col; j<=last_col; j++) {
spgpuDsetscal(handle, first_row, last_row, indexBase, val, ((double *) devVecX->v_)+(j-1)*pitch);
}
return(i);
}
int geinsMultiVecDeviceDouble(int n, void* devMultiVecIrl, void* devMultiVecVal,
int dupl, int indexBase, void* devMultiVecX)

@ -42,8 +42,14 @@ int writeMultiVecDeviceDoubleR2(void* deviceMultiVec, double* hostMultiVec, int
int readMultiVecDeviceDouble(void* deviceMultiVec, double* hostMultiVec);
int readMultiVecDeviceDoubleR2(void* deviceMultiVec, double* hostMultiVec, int ld);
int prodMultiVecDeviceDouble(char transa, int m, int n, int k,
double *alpha, void *deviceX, void* deviceY,
double *beta, void* deviceZ);
int setscalMultiVecDeviceDouble(double val, int first, int last,
int indexBase, void* devVecX);
int indexBase, void* devVecX);
int setscalMultiVecDeviceDoubleR2(double val, int first_row, int last_row,
int first_col, int last_col, int indexBase, void* devVecX);
int geinsMultiVecDeviceDouble(int n, void* devVecIrl, void* devVecVal,
int dupl, int indexBase, void* devVecX);

@ -1354,41 +1354,82 @@ module psb_d_cuda_multivect_mod
real(c_double), allocatable :: buffer(:,:)
type(c_ptr) :: dt_buf = c_null_ptr
contains
!
! Constructors/allocators
!
procedure, pass(x) :: bld_x => d_cuda_multi_bld_x
procedure, pass(x) :: bld_n => d_cuda_multi_bld_n
procedure, pass(x) :: all => d_cuda_multi_all
!
! Insert/set. Assembly and free.
! Assembly does almost nothing here, but is important
! in derived classes.
!
procedure, pass(x) :: ins => d_cuda_multi_ins
procedure, pass(x) :: zero => d_cuda_multi_zero
procedure, pass(x) :: asb => d_cuda_multi_asb
procedure, pass(x) :: free => d_cuda_multi_free
!
! Sync: centerpiece of handling of external storage.
! Any derived class having extra storage upon sync
! will guarantee that both fortran/host side and
! external side contain the same data. The base
! version is only a placeholder.
!
procedure, pass(x) :: sync => d_cuda_multi_sync
procedure, pass(x) :: sync_space => d_cuda_multi_sync_space
procedure, pass(x) :: is_host => d_cuda_multi_is_host
procedure, pass(x) :: is_dev => d_cuda_multi_is_dev
procedure, pass(x) :: is_sync => d_cuda_multi_is_sync
procedure, pass(x) :: set_host => d_cuda_multi_set_host
procedure, pass(x) :: set_dev => d_cuda_multi_set_dev
procedure, pass(x) :: set_sync => d_cuda_multi_set_sync
!
! Basic info
!
procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows
procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols
procedure, nopass :: get_fmt => d_cuda_multi_get_fmt
!
! Set/get data from/to an external array; also
! overload assignment.
!
procedure, pass(x) :: set_scal => d_cuda_multi_set_scal
procedure, pass(x) :: set_vect => d_cuda_multi_set_vect
!
! Product, dot-product (col-by-col) and AXPBY
!
procedure, pass(x) :: prod_v => d_cuda_multi_prod_v
procedure, pass(x) :: prod_a => d_cuda_multi_prod_a
procedure, pass(x) :: dot_v => d_cuda_multi_dot_v
procedure, pass(x) :: dot_a => d_cuda_multi_dot_a
procedure, pass(y) :: axpby_v => d_cuda_multi_axpby_v
procedure, pass(y) :: axpby_a => d_cuda_multi_axpby_a
!
! MultiVector by vector/multivector multiplication. Need all variants
! to handle multiple requirements from preconditioners
!
!!$ procedure, pass(y) :: mlt_v => d_cuda_multi_mlt_v
!!$ procedure, pass(y) :: mlt_a => d_cuda_multi_mlt_a
!!$ procedure, pass(z) :: mlt_a_2 => d_cuda_multi_mlt_a_2
!!$ procedure, pass(z) :: mlt_v_2 => d_cuda_multi_mlt_v_2
!!$ procedure, pass(x) :: scal => d_cuda_multi_scal
!
! Scaling and norms
!
procedure, pass(x) :: nrm2 => d_cuda_multi_nrm2
procedure, pass(x) :: amax => d_cuda_multi_amax
procedure, pass(x) :: asum => d_cuda_multi_asum
procedure, pass(x) :: all => d_cuda_multi_all
procedure, pass(x) :: zero => d_cuda_multi_zero
procedure, pass(x) :: asb => d_cuda_multi_asb
procedure, pass(x) :: sync => d_cuda_multi_sync
procedure, pass(x) :: sync_space => d_cuda_multi_sync_space
procedure, pass(x) :: bld_x => d_cuda_multi_bld_x
procedure, pass(x) :: bld_n => d_cuda_multi_bld_n
procedure, pass(x) :: free => d_cuda_multi_free
procedure, pass(x) :: ins => d_cuda_multi_ins
procedure, pass(x) :: is_host => d_cuda_multi_is_host
procedure, pass(x) :: is_dev => d_cuda_multi_is_dev
procedure, pass(x) :: is_sync => d_cuda_multi_is_sync
procedure, pass(x) :: set_host => d_cuda_multi_set_host
procedure, pass(x) :: set_dev => d_cuda_multi_set_dev
procedure, pass(x) :: set_sync => d_cuda_multi_set_sync
procedure, pass(x) :: set_scal => d_cuda_multi_set_scal
procedure, pass(x) :: set_vect => d_cuda_multi_set_vect
!!$ procedure, pass(x) :: scal => d_cuda_multi_scal
!
! Gather/scatter. These are needed for MPI interfacing.
! May have to be reworked.
!
!!$ procedure, pass(x) :: gthzv_x => d_cuda_multi_gthzv_x
!!$ procedure, pass(y) :: sctb => d_cuda_multi_sctb
!!$ procedure, pass(y) :: sctb_x => d_cuda_multi_sctb_x
!
! Finalize
!
final :: d_cuda_multi_vect_finalize
end type psb_d_multivect_cuda
@ -1607,13 +1648,63 @@ contains
res = 'dGPU'
end function d_cuda_multi_get_fmt
subroutine d_cuda_multi_prod_v(nr,x,y,res)
implicit none
class(psb_d_multivect_cuda), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y, res
integer(psb_ipk_), intent(in) :: nr
integer(psb_ipk_) :: info
select type(yy => y)
type is (psb_d_multivect_cuda)
select type(zz => res)
type is (psb_d_multivect_cuda)
if (x%is_host()) call x%sync()
if (yy%is_host()) call yy%sync()
if (zz%is_host()) call zz%sync()
if (info == 0) &
& info = prodMultiVecDevice('N',nr,yy%get_ncols(),x%get_ncols(),done,x%deviceVect,yy%deviceVect,done,zz%deviceVect)
call zz%set_dev()
class default
! TODO
end select
class default
call x%prod(nr,y%v,res)
end select
end subroutine d_cuda_multi_prod_v
subroutine d_cuda_multi_prod_a(nr,x,y,res)
implicit none
class(psb_d_multivect_cuda), intent(inout) :: x
real(psb_dpk_), intent(in) :: y(:,:)
class(psb_d_base_multivect_type), intent(inout) :: res
integer(psb_ipk_), intent(in) :: nr
type(c_ptr) :: gpY
integer(psb_ipk_) :: info, i
select type(zz => res)
type is (psb_d_multivect_cuda)
if (x%is_host()) call x%sync()
if (zz%is_host()) call zz%sync()
if (info == 0) &
& info = FallocMultiVecDevice(gpY,size(y,2),size(y,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = prodMultiVecDevice('N',nr,size(y,2),x%get_ncols(),done,x%deviceVect,gpY,dzero,zz%deviceVect)
call freeMultiVecDevice(gpY)
call zz%set_dev()
class default
! TODO
end select
end subroutine d_cuda_multi_prod_a
function d_cuda_multi_dot_v(nr,x,y) result(res)
implicit none
class(psb_d_multivect_cuda), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(in) :: nr
real(psb_dpk_), allocatable :: res(:,:)
real(psb_dpk_), external :: ddot
integer(psb_ipk_) :: info
!
@ -1824,22 +1915,33 @@ contains
!!$ end select
!!$ end subroutine d_cuda_multi_mlt_v_2
subroutine d_cuda_multi_set_scal(x,val)
subroutine d_cuda_multi_set_scal(x,val,first_row,last_row,first_col,last_col)
class(psb_d_multivect_cuda), intent(inout) :: x
real(psb_dpk_), intent(in) :: val
integer(psb_ipk_) :: info
real(psb_dpk_), intent(in) :: val
integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
integer(psb_ipk_) :: info, first_row_, last_row_, first_col_, last_col_
first_row_=1
last_row_=size(x%v,1)
first_col_=1
last_col_=size(x%v,2)
if (present(first_row)) first_row_ = max(first_row,first_row_)
if (present(last_row)) last_row_ = min(last_row,last_row_)
if (present(first_col)) first_col_ = max(first_col,first_col_)
if (present(last_col)) last_col_ = min(last_col,last_col_)
info = setScalDevice(val,first_row_,last_row_,first_col_,last_col_,1,x%deviceVect)
call x%set_dev()
if (x%is_dev()) call x%sync()
call x%psb_d_base_multivect_type%set_scal(val)
call x%set_host()
end subroutine d_cuda_multi_set_scal
subroutine d_cuda_multi_set_vect(x,val)
subroutine d_cuda_multi_set_vect(x,val,first_row,last_row,first_col,last_col)
class(psb_d_multivect_cuda), intent(inout) :: x
real(psb_dpk_), intent(in) :: val(:,:)
integer(psb_ipk_) :: nr
integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
integer(psb_ipk_) :: info
if (x%is_dev()) call x%sync()
@ -1848,8 +1950,6 @@ contains
end subroutine d_cuda_multi_set_vect
!!$ subroutine d_cuda_multi_scal(alpha, x)
!!$ implicit none
!!$ class(psb_d_multivect_cuda), intent(inout) :: x
@ -1870,7 +1970,6 @@ contains
if (x%is_host()) call x%sync()
allocate(res(x%get_ncols()))
info = nrm2MultiVecDevice(res,nr,x%deviceVect)
end function d_cuda_multi_nrm2
! TODO CUDA
@ -1887,7 +1986,6 @@ contains
do i=1,nr
res = max(res,sum(abs(x%v(i,1:nc))))
end do
end function d_cuda_multi_amax
! TODO CUDA
@ -1903,7 +2001,6 @@ contains
do j=1,x%get_ncols()
res = max(res,sum(abs(x%v(1:nr,j))))
end do
end function d_cuda_multi_asum
subroutine d_cuda_multi_all(m,n, x, info)
@ -1931,8 +2028,8 @@ contains
implicit none
class(psb_d_multivect_cuda), intent(inout) :: x
if (allocated(x%v)) x%v=dzero
call x%set_host()
call x%set_dev()
call x%set_scal(dzero)
end subroutine d_cuda_multi_zero
subroutine d_cuda_multi_asb(m,n, x, info)
@ -1944,7 +2041,6 @@ contains
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: nd, nc
x%m_nrows = m
x%m_ncols = n
if (x%is_host()) then
@ -2035,7 +2131,7 @@ contains
else if (x%is_dev()) then
info = readMultiVecDevice(x%deviceVect,x%v,size(x%v,1))
end if
if (info == 0) call x%set_sync()
if (info == 0) call x%set_sync()
if (info /= 0) then
info=psb_err_internal_error_
call psb_errpush(info,'d_cuda_multi_sync')

@ -80,6 +80,18 @@ module psb_d_vectordev_mod
end function readMultiVecDeviceDoubleR2
end interface
interface prodMultiVecDevice
function prodMultiVecDeviceDouble(transa,m,n,k,alpha,deviceVecA, &
& deviceVecB,beta,deviceVecC) result(res) bind(c,name='prodMultiVecDeviceDouble')
use iso_c_binding
character(c_char), value :: transa
integer(c_int), value :: m, n, k
integer(c_int) :: res
type(c_ptr), value :: deviceVecA, deviceVecB, deviceVecC
real(c_double) :: alpha, beta
end function prodMultiVecDeviceDouble
end interface
interface allocateDouble
function allocateDouble(didx,n) &
& result(res) bind(c,name='allocateDouble')
@ -159,7 +171,6 @@ module psb_d_vectordev_mod
end subroutine freeDouble
end interface
interface setScalDevice
function setScalMultiVecDeviceDouble(val, first, last, &
& indexBase, deviceVecX) result(res) &
@ -170,6 +181,15 @@ module psb_d_vectordev_mod
real(c_double), value :: val
type(c_ptr), value :: deviceVecX
end function setScalMultiVecDeviceDouble
function setScalMultiVecDeviceDoubleR2(val, first_row, last_row, &
& first_col, last_col, indexBase, deviceVecX) result(res) &
& bind(c,name='setscalMultiVecDeviceDoubleR2')
use iso_c_binding
integer(c_int) :: res
integer(c_int), value :: first_row,last_row,first_col,last_col,indexbase
real(c_double), value :: val
type(c_ptr), value :: deviceVecX
end function setScalMultiVecDeviceDoubleR2
end interface
interface

@ -63,7 +63,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
real(psb_dpk_), allocatable :: aux(:), h(:,:), beta(:,:), y(:,:)
type(psb_d_multivect_type), allocatable :: v(:)
type(psb_d_multivect_type) :: w, r0, rm, temp
type(psb_d_multivect_type) :: w, r0, rm, pd
integer(psb_ipk_) :: naux, itrace_, n_row, n_col, nrhs, nrep
integer(psb_lpk_) :: mglob, n_add, ncv
@ -153,12 +153,12 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
if (info == psb_success_) call psb_geall(w,desc_a,info,n=nrhs)
if (info == psb_success_) call psb_geall(r0,desc_a,info,n=nrhs)
if (info == psb_success_) call psb_geall(rm,desc_a,info,n=nrhs)
if (info == psb_success_) call psb_geall(temp,desc_a,info,n=nrhs)
if (info == psb_success_) call psb_geall(pd,desc_a,info,n=nrhs)
if (info == psb_success_) call psb_geasb(v,desc_a,info,mold=x%v,n=nrhs)
if (info == psb_success_) call psb_geasb(w,desc_a,info,mold=x%v,n=nrhs)
if (info == psb_success_) call psb_geasb(r0,desc_a,info,mold=x%v,n=nrhs)
if (info == psb_success_) call psb_geasb(rm,desc_a,info,mold=x%v,n=nrhs)
if (info == psb_success_) call psb_geasb(temp,desc_a,info,mold=x%v,n=nrhs)
if (info == psb_success_) call psb_geasb(pd,desc_a,info,mold=x%v,n=nrhs)
if (info /= psb_success_) then
info=psb_err_from_subroutine_non_
call psb_errpush(info,name)
@ -261,8 +261,8 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
! STEP 7: Compute W = W - V(i)*H(i,j)
! Compute temp product V(i)*H(i,j)
call mv_prod(v(i)%get_vect(),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add))
! Compute product V(i)*H(i,j)
call psb_geprod(v(i),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add),pd,desc_a,info)
if (info /= psb_success_) then
info=psb_err_from_subroutine_non_
call psb_errpush(info,name)
@ -270,7 +270,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
end if
! Compute W
call psb_geaxpby(-done,temp,done,w,desc_a,info)
call psb_geaxpby(-done,pd,done,w,desc_a,info)
if (info /= psb_success_) then
info=psb_err_from_subroutine_non_
call psb_errpush(info,name)
@ -327,16 +327,16 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
! Compute index for Y products
idx = (i-1)*nrhs+1
! Compute temp product V(i)*Y(i)
call mv_prod(v(i)%get_vect(),y(idx:idx+n_add,:))
! Compute product V(i)*Y(i)
call psb_geprod(v(i),y(idx:idx+n_add,:),pd,desc_a,info)
if (info /= psb_success_) then
info=psb_err_from_subroutine_non_
call psb_errpush(info,name)
goto 9999
end if
! Add temp to X(m)
call psb_geaxpby(done,temp,done,x,desc_a,info)
! Add product to X(m)
call psb_geaxpby(done,pd,done,x,desc_a,info)
if (info /= psb_success_) then
info=psb_err_from_subroutine_non_
call psb_errpush(info,name)
@ -356,6 +356,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
if (info == psb_success_) call psb_gefree(w,desc_a,info)
if (info == psb_success_) call psb_gefree(r0,desc_a,info)
if (info == psb_success_) call psb_gefree(rm,desc_a,info)
if (info == psb_success_) call psb_gefree(pd,desc_a,info)
if (info == psb_success_) deallocate(aux,h,y,r0n2,rmn2,stat=info)
if (info /= psb_success_) then
info=psb_err_from_subroutine_non_
@ -426,36 +427,7 @@ contains
end function qr_fact
! Multivectors product
subroutine mv_prod(x_mv,y_mv)
implicit none
! I/O parameters
real(psb_dpk_), intent(in) :: x_mv(:,:), y_mv(:,:)
! Utils
real(psb_dpk_), allocatable :: res(:,:)
integer(psb_ipk_) :: lda, ldb
! Initialize params
lda = size(x_mv,1)
ldb = size(y_mv,2)
allocate(res(lda,ldb))
res = dzero
! Compute product
if (n_row > 0) then
call dgemm('N','N',n_row,nrhs,nrhs,done,x_mv,lda,y_mv,ldb,dzero,res,lda)
end if
! Set temp multivector
call temp%set(res)
! Deallocate
deallocate(res)
end subroutine mv_prod
! TODO Loop con Givens rotation su ogni colonna
! Minimize Frobenius norm
function frobenius_norm_min(rep) result(res)
implicit none

@ -548,7 +548,7 @@ program dpdegen
! solver parameters
real(psb_dpk_) :: err, cond
integer(psb_epk_) :: amatsize, precsize, descsize
integer(psb_epk_) :: amatsize, precsize, descsize, annz
integer(psb_ipk_) :: iter, ierr, ircode
! molds
@ -599,7 +599,7 @@ program dpdegen
!
call psb_barrier(ctxt)
t1 = psb_wtime()
call psb_gen_pde3d(ctxt,idim,a,b_mv,x_mv,nrhs,desc_a,'CSR ',info,partition=3,tnd=tnd)
call psb_gen_pde3d(ctxt,idim,a,b_mv,x_mv,nrhs,desc_a,'CSR ',info,vmold=gpumold,partition=3,tnd=tnd)
call psb_barrier(ctxt)
t2 = psb_wtime() - t1
if(info /= psb_success_) then
@ -640,6 +640,12 @@ program dpdegen
stop
end if
! set random RHS
call random_number(b_mv%v%v)
b_mv%v%v = -10 + (20)*b_mv%v%v
call b_mv%v%set_host()
call b_mv%sync()
call psb_geall(r_mv,desc_a,info,nrhs)
call r_mv%zero()
call psb_geasb(r_mv,desc_a,info)
@ -694,10 +700,12 @@ program dpdegen
resmxp = psb_geamax(r_mv,desc_a,info)
amatsize = a%sizeof()
annz = a%get_nzeros()
descsize = desc_a%sizeof()
precsize = prec%sizeof()
call psb_sum(ctxt,amatsize)
call psb_sum(ctxt,annz)
call psb_sum(ctxt,descsize)
call psb_sum(ctxt,precsize)
@ -710,6 +718,9 @@ program dpdegen
call prec%descr(info)
write(psb_out_unit,'(" ")')
write(psb_out_unit,'("Computed solution on: ",i8," processors")')np
write(psb_out_unit,'("Dimesion of A: ",i12)')desc_a%get_global_rows()
write(psb_out_unit,'("Number of nonzeros: ",i12)')annz
write(psb_out_unit,'("Number of RHS: ",i12)')nrhs
write(psb_out_unit,'("Storage format for A: ",a)')a%get_fmt()
write(psb_out_unit,'("Storage format for DESC_A: ",a)')desc_a%get_fmt()
write(psb_out_unit,'("Total memory occupation for A: ",i12)')amatsize

@ -596,7 +596,7 @@ program pdegenmm
! solver parameters
integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
real(psb_dpk_) :: err, eps
integer, parameter :: ntests=50, ngpu=1, ncnv=1
integer, parameter :: ntests=20, ngpu=1, ncnv=1
type(psb_d_coo_sparse_mat), target :: acoo
type(psb_d_csr_sparse_mat), target :: acsr
type(psb_d_ell_sparse_mat), target :: aell

Loading…
Cancel
Save