You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
psblas3/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c

250 lines
9.6 KiB
C

/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/
#ifndef OFF_F
#error generic implementation requires OFF_F defined
#endif
#ifndef SP3MM_OMP_SYMB
#define SP3MM_OMP_SYMB
//allocCSRSpMatSymbStep aux function for IRP set by symb step output
static inline idx_t _setCSR_IRP_1DPartitioing(spmat* m, idx_t* rowSizes){
idx_t r,cumulSize;
for (r=0,cumulSize=0; r<m->M; cumulSize += rowSizes[r++])
m->IRP[r] = cumulSize;
DEBUGCHECKS assert(cumulSize == rowSizes[m->M]);
m->IRP[m->M] = cumulSize;
return cumulSize;
}
static inline idx_t _setCSR_IRP_2DPartitioing(spmat* m,idx_t* rowSizes,ushort gridCols){
idx_t cumulSize=0;
for (idx_t r=0,cumulSizeOld; r<m->M; r++){
m->IRP[r] = cumulSize;
for (ushort gc=0; gc<gridCols; gc++){
cumulSizeOld = cumulSize;
cumulSize += rowSizes[ IDX2D(r,gc,gridCols) ];
//inplace update 2D rowParts len matrix as IRP like start offsets
rowSizes[ IDX2D(r,gc,gridCols) ] = cumulSizeOld;
}
}
DEBUGCHECKS assert(cumulSize == rowSizes[m->M*gridCols]);
m->IRP[m->M] = cumulSize;
return cumulSize;
}
/*
* allocate CSR mat @m, setting up correctly IRP and other buffers allocations
* if @gridCols == 1: (1D partitioning of @m) -> @rowsSizes will be an array of row lenghts
* if @gridCols > 1: (2D partitioning of @m) -> @rowsSizes will be a matrix with
* elem i,j = len of j-th colPartition (out of @gridCols) of i-th row
*/
static inline int allocCSRSpMatSymbStep(spmat* m,idx_t* rowSizes,ushort gridCols){
//setup IRP and get cumul, whole size of @m
if (!gridCols) return EXIT_FAILURE;
idx_t cumulSize;
if (gridCols == 1) cumulSize = _setCSR_IRP_1DPartitioing(m,rowSizes);
/*if (gridCols > 1} -- static analizer miss unsignedness...*/
else cumulSize = _setCSR_IRP_2DPartitioing(m,rowSizes,gridCols);
m->IRP[m->M] = cumulSize;
m->NZ = cumulSize;
if (!(m->AS = malloc(rowSizes[m->M] * sizeof(*m->AS))) ){
ERRPRINT("allocCSRSpMatSymbStep m->AS malloc errd\n");
return EXIT_FAILURE;
}
if (!(m->JA = malloc(rowSizes[m->M] * sizeof(*m->JA))) ){
ERRPRINT("allocCSRSpMatSymbStep m->JA malloc errd\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
#endif
//////////////////// COMPUTE CORE Sp[3]MM SYMB-NUMB PHASE //////////////////////////
////////Sp3MM as 2 x SpMM
///1D
spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
idx_t* rowsSizes = NULL;
///init AB matrix with SPMM heuristic preallocation
spmat* AB = allocSpMatrix(A->M,B->N);
if (!AB) goto _err;
///aux structures alloc
if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
ERRPRINT("accVects init failed\n");
goto _err;
}
///SYMBOLIC STEP
if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))
goto _err;
if (allocCSRSpMatSymbStep(AB,rowsSizes,1)) goto _err;
///NUMERIC STEP
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(acc)
for (idx_t r=0; r<A->M; r++){ //row-by-row formulation
acc = accVects + omp_get_thread_num();
_resetAccVect(acc); //rezero for the next A row
//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]
/* direct use of sparse scalar vector multiplication
for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){
ca = A->JA[ja] - OFF_F;
jb = B->IRP[ca] - OFF_F;
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
} */
for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
//direct sparsify: trasform accumulated dense vector to a CSR row
sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
}
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG checkOverallocPercent(rowsSizes,AB);
goto _free;
_err:
if(AB) freeSpmat(AB);
AB=NULL; //nothing'll be returned
_free:
free(rowsSizes);
if(accVects) freeAccsDense(accVects,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
idx_t* rowsSizes = NULL;
///init AB matrix with SPMM heuristic preallocation
spmat* AB = allocSpMatrix(A->M,B->N);
if (!AB) goto _err;
///aux structures alloc
if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
ERRPRINT("accVects init failed\n");
goto _err;
}
///SYMBOLIC STEP
if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))
goto _err;
if (allocCSRSpMatSymbStep(AB,rowsSizes,1))
goto _err;
///NUMERIC STEP
//perform Gustavson over rows blocks -> M / @cfg->gridRows
ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong b,startRow,block; //omp for aux vars
#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
for (b=0; b < cfg->gridRows; b++){
block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
startRow = UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem);
for (idx_t r=startRow; r<startRow+block; r++){
acc = accVects + omp_get_thread_num();
_resetAccVect(acc); //rezero for the next A row
//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]
/* direct use of sparse scalar vector multiplication
for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){
ca = A->JA[ja] - OFF_F;
jb = B->IRP[ca] - OFF_F;
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
} */
for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
//direct sparsify: trasform accumulated dense vector to a CSR row
sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
}
}
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG checkOverallocPercent(rowsSizes,AB);
goto _free;
_err:
if(AB) freeSpmat(AB);
AB=NULL; //nothing'll be returned
_free:
if(rowsSizes) free(rowsSizes);
if(accVects) freeAccsDense(accVects,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
idx_t* bColOffsets = NULL; //B group columns starting offset for each row
ACC_DENSE *accVectors=NULL,*accV;
SPACC* accRowPart;
spmat* AB = allocSpMatrix(A->M,B->N);
SPMM_ACC* outAccumul=NULL;
idx_t *rowsPartsSizes=NULL, *rowSizes=NULL; //for rows' cols partition, correct len
if (!AB) goto _err;
//2D indexing aux vars
ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
////get bColOffsets for B column groups
if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
goto _err;
///TODO TODO
_err:
if (AB) freeSpmat(AB);
AB = NULL;
_free:
free(rowsPartsSizes);
free(bColOffsets);
if (accVectors) freeAccsDense(accVectors,gridSize);
if (outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
/* TODO
///2D
//PARTITIONS NOT ALLOCATED
spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
///SP3MM
spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){
////////Sp3MM direct
///1D
spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){
*/