// psblas3/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h

/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
///scalar-vector multiply
//TODO what gain comes from using only the generic version of the next 2 functions?
/*
 * Sparse vector part <-> scalar multiplication, accumulated into a dense output.
 * The sparse vector part holds nnz values in @vectVals (an AS subvector),
 * with the @vectLen corresponding indexes in @vectIdxs (a JA subvector),
 * each below @aux->vLen once OFF_F is removed.
 * The resulting vector is accumulated in the dense array @aux->v, tracking the
 * nonzero indexes in @aux->nnzIdx; both the accumulator's dense array and
 * nnzIdx in @aux have to be at least @vectLen big.
 */
inline void CAT(scSparseVectMul_,OFF_F)
(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux){
    for (ulong i=0,j; i<vectLen; i++){
        j = vectIdxs[i]-OFF_F;
        DEBUGCHECKS{
            if (j>=aux->vLen){
                fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
                assert(j < aux->vLen);
            }
        }
        aux->v[j] += vectVals[i] * scalar;  //accumulate
        //append new nonzero index to aux->nnzIdx for quick sparsify later
        //if (!(aux->v[j]))  aux->nnzIdx[ aux->nnzIdxMap.len++ ] = j;  //TODO numerical zero may then cause re-add
        if(!spVect_idx_in(j,&aux->nnzIdxMap))
            aux->nnzIdx[ aux->nnzIdxMap.len-1 ] = j;
    }
}
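/*
 * Usage sketch (disabled): scale one CSR row slice into a dense accumulator.
 * Assumes an ACC_DENSE @acc whose v and nnzIdx arrays are already sized for
 * the output width, and the 0-based instantiation of this header
 * (OFF_F == 0, so CAT(scSparseVectMul_,OFF_F) expands to scSparseVectMul_0).
 */
#if 0
void exampleScaleRow(spmat* A, ulong r, double alpha, ACC_DENSE* acc){
    ulong start = A->IRP[r];                //row r slice: [ IRP[r], IRP[r+1] )
    ulong len   = A->IRP[r+1] - A->IRP[r];
    scSparseVectMul_0(alpha, A->AS+start, A->JA+start, len, acc);
    //acc->v now holds alpha*A[r,:]; acc->nnzIdx[0..nnzIdxMap.len) its nnz columns
}
#endif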
/* Same as scSparseVectMul_ above, but with an initial offset removed from each index.
 * Sparse vector part <-> scalar multiplication, accumulated into a dense output.
 * @vectVals: sparse vector part values (from spmat.AS),
 *            with [at least] @vectLen corresponding
 *            target indexes in @vectIdxs (from spmat.JA), starting from @startIdx
 *
 * The resulting vector is accumulated in the dense array @aux->v, tracking the
 * nonzero indexes in @aux->nnzIdx; every nnz index is shifted back by @startIdx in @aux.
 * Both the accumulator's dense array and nnzIdx in @aux have to be at least @vectLen big.
 */
inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
  ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux){
    for (ulong i=0,j; i<vectLen; i++){
        j = vectIdxs[i]-OFF_F - startIdx;
        DEBUGCHECKS{
            if (j>=aux->vLen){
                fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
                assert(j < aux->vLen);
            }
        }
        aux->v[j] += vectVals[i] * scalar;  //accumulate
        //append new nonzero index to aux->nnzIdx for quick sparsify later
        //if (!(aux->v[j]))  aux->nnzIdx[ aux->nnzIdxMap.len++ ] = j;
        ////TODO numerical zero may then cause re-add
        if(!spVect_idx_in(j,&aux->nnzIdxMap))
            aux->nnzIdx[ aux->nnzIdxMap.len-1 ] = j;
    }
}
///MultiImplementations functions
#ifndef OFF_F
#error generic implementations require OFF_F defined
#endif
///TODO IMPLEMENT SCALAR<->ROW MUL AS GENERIC SPARSE VECTOR<->SCALAR MUL
//inline void scSparseRowMul(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
    ulong rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
#ifdef ROWLENS
    rowLen = mat->RL[trgtR];
#else
    rowLen = mat->IRP[trgtR+1] - mat->IRP[trgtR];
#endif
    CAT(scSparseVectMul_,OFF_F)(scalar,mat->AS+rowStartIdx,mat->JA+rowStartIdx,rowLen,aux);
    //TODO check impact of generic version use
}
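/*
 * Sketch (disabled) of the Gustavson row-by-row product this helper serves:
 * row r of AB is the sum, over the nonzeros A[r,j], of A[r,j] * B[j,:].
 * Assumes a zeroed ACC_DENSE @acc of width B->N and the 0-based instantiation.
 */
#if 0
void exampleSpMMRow(spmat* A, spmat* B, ulong r, ACC_DENSE* acc){
    for (ulong jj = A->IRP[r]; jj < A->IRP[r+1]; jj++)
        scSparseRowMul_0(A->AS[jj], B, A->JA[jj], acc); //acc += A[r,j] * B[j,:]
    //acc now holds the dense row (AB)[r,:], ready to be sparsified
}
#endif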
///OUTPUT SIZE PREDICTION
/* O(A.NZ [+B.M])
 * Return an array of upper bounds on the row sizes of the product @A * @B,
 * with the cumulative total size of the matrix AB = A*B appended at the end.
 */
inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
    AUDIT_INTERNAL_TIMES    Start = omp_get_wtime();
    idx_t* rowSizes = calloc((A->M+1), sizeof(*rowSizes));
    if (!rowSizes){
        ERRPRINT("spMMSizeUpperbound: rowSizes calloc errd\n");
        return NULL;
    }
    idx_t fullMatBound = 0;
    #pragma omp parallel for schedule(static) reduction(+:fullMatBound)
    for (idx_t r=0; r<A->M; r++){
        for (idx_t jj=A->IRP[r] - OFF_F,j,rlen; jj<A->IRP[r+1] - OFF_F; jj++){
            j = A->JA[jj] - OFF_F;
#ifdef ROWLENS
            rlen = B->RL[j];
#else
            rlen = B->IRP[j+1] - B->IRP[j]; //OFF_F: delta is offset independent
#endif
            rowSizes[r]  += rlen;
            fullMatBound += rlen;
            //rowSizes[A->M] += rlen;   //done via the omp reduction sum just below
        }
    }
    rowSizes[A->M] = fullMatBound;
    AUDIT_INTERNAL_TIMES    End = omp_get_wtime();
    VERBOSE
        printf("spMMSizeUpperbound:%lu\t%le s\n",rowSizes[A->M],End-Start);
    return rowSizes;
}
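/*
 * Sketch (disabled): the appended total lets a caller reserve worst-case
 * space for AB in one allocation; the per-row bounds can likewise become
 * per-row write offsets via a prefix sum. Illustrative, 0-based names.
 */
#if 0
idx_t* rowSizes = spMMSizeUpperbound_0(A,B);
if (rowSizes){
    idx_t ubNNZ = rowSizes[A->M];               //cumulative upper bound of AB.NZ
    double* AS  = malloc(ubNNZ * sizeof(*AS));  //worst-case values space
    idx_t*  JA  = malloc(ubNNZ * sizeof(*JA));  //worst-case indexes space
}
#endif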
/* O(A.NZ + B.NZ)
 * Return a matrix of size @A.M x @gridCols of upper bounds, one
 * for each of the @gridCols column partitions of the output matrix AB = @A * @B,
 * with the cumulative total size of AB appended at the end.
 */
inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
(spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets){
    AUDIT_INTERNAL_TIMES    Start = omp_get_wtime();
    idx_t* rowPartsSizes = calloc((A->M*gridCols +1), sizeof(*rowPartsSizes));
    if (!rowPartsSizes){
        ERRPRINT("spMMSizeUpperboundColParts: rowPartsSizes calloc errd\n");
        return NULL;
    }
    idx_t fullMatBound = 0;
    #pragma omp parallel for schedule(static) reduction(+:fullMatBound)
    for (idx_t r=0; r<A->M; r++){
        //for each A.row -> sum B.colParts lens
        for (idx_t jj=A->IRP[r]-OFF_F,j,rlen; jj<A->IRP[r+1]-OFF_F; jj++){
            j = A->JA[jj] - OFF_F;
            for (idx_t gc=0,bPartID=IDX2D(j,gc,gridCols); gc < gridCols; gc++,bPartID++){
                rlen = bColPartOffsets[bPartID+1] - bColPartOffsets[bPartID];
                rowPartsSizes[ IDX2D(r,gc,gridCols) ] += rlen;
                fullMatBound += rlen;
            }
        }
    }
    rowPartsSizes[ A->M*gridCols ] = fullMatBound;
    AUDIT_INTERNAL_TIMES    End = omp_get_wtime();
    VERBOSE
        printf("spMMSizeUpperboundColParts_:%lu\t%le s\n",rowPartsSizes[A->M*gridCols],End-Start);
    return rowPartsSizes;
}
////Single-implementation functions
#ifndef SPMMUTILS_H_SINGLE_IMPLEMENTATION
#define SPMMUTILS_H_SINGLE_IMPLEMENTATION
#include "utils.h"
///Allocs - Free
//SpMM holder of accumulators
inline SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum){
    SPMM_ACC* out = calloc(1,sizeof(*out));
    if (!out){
        ERRPRINT("initSpMMAcc: out calloc errd\n");
        return NULL;
    }
    out->size = entriesNum;
    if (!(out->JA = malloc(entriesNum * sizeof(*(out->JA))))){
        ERRPRINT("initSpMMAcc: JA malloc errd\n");
        goto _err;
    }
    if (!(out->AS = malloc(entriesNum * sizeof(*(out->AS))))){
        ERRPRINT("initSpMMAcc: AS malloc errd\n");
        goto _err;
    }
    if (!(out->accs = malloc(accumulatorsNum * sizeof(*(out->accs))))){
        ERRPRINT("initSpMMAcc: accs malloc errd\n");
        goto _err;
    }
    return out;

    _err:
    if (out->JA)    free(out->JA);
    if (out->AS)    free(out->AS);
    if (out->accs)  free(out->accs);
    free(out);  //out is non-NULL here
    return NULL;
}
inline void freeSpMMAcc(SPMM_ACC* acc){
    free(acc->JA);
    free(acc->AS);
    free(acc->accs);
    free(acc);
}
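/*
 * Sketch (disabled): typical pairing of the size prediction with this holder,
 * one shared SPMM_ACC for the whole product. Illustrative only: here one
 * sparse accumulator per output row is reserved.
 */
#if 0
idx_t* rowSizes = spMMSizeUpperbound_0(A,B);
SPMM_ACC* acc = initSpMMAcc(rowSizes[A->M], A->M); //worst-case entries, M accumulators
//... threads claim exactly sized chunks via sparsifyUBNoPartsBounds ...
freeSpMMAcc(acc);
free(rowSizes);
#endif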
///// dense acc sparsify functions
/*
 * Sparsify the dense accumulated vector @accV (indexes shifted by @startColAcc)
 * into the sparse accumulator @accSparse, which will use space for nnz entries from @acc.
 */
///DIRECT OUT MATRIX SPARSIFY
/*
 * Sparsify @accV directly inside row @row of matrix @m,
 * considering, if @colPartsOffsets is given not NULL, a 2D partitioning with
 * @gridCols column groups and per-row column-group offsets in @colPartsOffsets.
 * Returns by modifying @m in place.
 */
static inline void sparsifyDirect(ACC_DENSE* accV,spmat* m,idx_t row){
    idx_t nnz = accV->nnzIdxMap.len, sparsifyStart = m->IRP[row], sparsifyEnd = m->IRP[row+1];
    sort_idx_t(accV->nnzIdx,nnz);   //sort nnz idx for ordered write
    DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
    DEBUGCHECKS assert(nnz <= (sparsifyEnd - sparsifyStart));
    for (idx_t i=0,j; i < nnz; i++){
        j = accV->nnzIdx[i];
        m -> JA[sparsifyStart + i] = j; //+ startColAcc;
        m -> AS[sparsifyStart + i] = accV->v[j];
    }
}
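/*
 * Sketch (disabled): direct sparsify assumes a prior symbolic step already
 * filled @m->IRP with exact row sizes, so each row's slot is reserved.
 * threadAcc, _resetAccVect and exampleSpMMRow are hypothetical names
 * (the latter sketched above, after scSparseRowMul_).
 */
#if 0
//inside an omp parallel for over the output rows r:
_resetAccVect(threadAcc);           //hypothetical: zero dense acc + nnz map
exampleSpMMRow(A,B,r,threadAcc);    //accumulate row r of AB
sparsifyDirect(threadAcc,AB,r);     //write it straight into AB's reserved row slot
#endif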
//TODO 2D partitioning - colParts version of the above... best to use the multi impl trick of symbStep
static inline void sparsifyDirectColParts(ACC_DENSE* accV,spmat* m,idx_t row,
  ushort colGroupId,ushort gridCols, idx_t* colPartsOffsets,idx_t startCol){
    idx_t nnz = accV->nnzIdxMap.len;
    idx_t partID = IDX2D(row,colGroupId,gridCols);  //idx_t: row*gridCols may overflow ushort
    idx_t sparsifyStart = colPartsOffsets[partID], sparsifyEnd = colPartsOffsets[partID+1];
    sort_idx_t(accV->nnzIdx,nnz);   //sort nnz idx for ordered write
    DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
    DEBUGCHECKS assert( nnz == sparsifyEnd - sparsifyStart );
    for (idx_t i=0,j; i < nnz; i++){
        j = accV->nnzIdx[i];
        m -> JA[sparsifyStart + i] = j + startCol;
        m -> AS[sparsifyStart + i] = accV->v[j];
    }
}
///UB SPACE SPARSIFY
//internal: sparsify a dense acc inside a (prepared) sparse acc struct
static inline void _sparsifyUB(ACC_DENSE* accV,SPACC* accSparse,idx_t startColAcc){
    idx_t nnz = accV->nnzIdxMap.len;
    sort_idx_t(accV->nnzIdx,nnz);   //sort nnz idx for ordered write
    DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
    for (idx_t i=0,j; i < nnz; i++){
        j = accV->nnzIdx[i];
        accSparse -> JA[i] = j + startColAcc;
        accSparse -> AS[i] = accV->v[j];
    }
    accSparse -> len = nnz;
}
//row[Part] sparsified into a thread-safe, exactly sized reserved area, using atomics
static inline void sparsifyUBNoPartsBounds
(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc){
    //number of nnz entries accumulated in the dense accumulator
    idx_t nnz = accV->nnzIdxMap.len;
    idx_t sparsifyStartV;   //start index (inside @acc) of the @accV entries to sparsify
    //sparsifyStartV = __atomic_fetch_add(&(acc->lastAssigned),nnz,__ATOMIC_ACQ_REL);
    #pragma omp atomic capture
    {   //fetch-and-add
        sparsifyStartV = acc->lastAssigned;
        acc->lastAssigned += nnz;
    }
    DEBUGCHECKS{
        if (acc->lastAssigned >= acc->size){
            ERRPRINT("OMP ATOMIC OR SG ERRD IN SPACE ASSIGNMENTS...\n");
            assert(acc->lastAssigned < acc->size);
        }
    }
    accSparse -> AS = acc->AS + sparsifyStartV;
    accSparse -> JA = acc->JA + sparsifyStartV;
    _sparsifyUB(accV,accSparse,startColAcc);
}
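/*
 * Sketch (disabled) of the UB-space counterpart of the direct flow above:
 * without a symbolic step, each thread builds one output row in its own
 * ACC_DENSE and then atomically claims an exactly sized slice of the shared
 * SPMM_ACC. sharedAcc, rowsSparse and the helpers are hypothetical names.
 */
#if 0
//inside an omp parallel for over the output rows r:
_resetAccVect(threadAcc);                   //hypothetical: zero dense acc + nnz map
exampleSpMMRow(A,B,r,threadAcc);            //accumulate row r of AB
sparsifyUBNoPartsBounds(sharedAcc,threadAcc,rowsSparse+r,0);    //claim + write
//rowsSparse can then be gathered into one CSR matrix via mergeRows below
#endif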
////output-gather functions
/*
 * Merge the @conf->gridCols * @mat->M sparse row partitions @rowsParts into @mat.
 * EXPECTED: @rowsParts sorted in accord with the
 * 2D rowMajor computing grid given in @conf.
 * Allocates the arrays holding nonzero values and indexes into @mat.
 */
inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,CONFIG* conf){
    ulong nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
    //TODO PARALLEL MERGE - SAVE STARTING offset OF EACH PARTITION IN THE OUT MATRIX
    ulong* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
    ///count nnz entries and alloc arrays for them
    for (ulong r=0; r<mat->M; r++){
        //for each partition -> get len -> outMat.IRP and aux offsets
        for (j=0,rLen=0; j<conf->gridCols; j++){
            idx = IDX2D(r,j,conf->gridCols);
            rowsPartsOffsets[idx]=nzNum+rLen;   //part start = prev accumulated end
            rLen += rowsParts[idx].len;
        }
        nzNum += rLen;
        mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
        mat->RL[r] = rLen;
#endif
    }
    mat->NZ = nzNum;
    if (!(mat->AS = malloc(nzNum * sizeof(*(mat->AS))))){
        ERRPRINT("merged sparse matrix AS alloc errd\n");
        return EXIT_FAILURE;
    }
    if (!(mat->JA = malloc(nzNum * sizeof(*(mat->JA))))){
        ERRPRINT("merged sparse matrix JA alloc errd\n");
        return EXIT_FAILURE;
    }
    ///populate with rows nnz values and indexes
    ulong pLen; //omp for aux var
    #pragma omp parallel for schedule(static) private(pLen)
    for (ulong i=0; i<partsNum; i++){
        pLen = rowsParts[i].len;
        memcpy(mat->AS + rowsPartsOffsets[i],rowsParts[i].AS,pLen*sizeof(*(mat->AS)));
        memcpy(mat->JA + rowsPartsOffsets[i],rowsParts[i].JA,pLen*sizeof(*(mat->JA)));
    }
    CONSISTENCY_CHECKS{ //TODO REMOVE: check written nnz manually
        for (ulong i=0,w=0; i<mat->M; i++){
            if (mat->IRP[i] != w)
                {ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
            for (j=0; j<conf->gridCols; j++){
                SPACC r = rowsParts[IDX2D(i,j,conf->gridCols)];
                for (ulong jj=0; jj<r.len; jj++,w++){
                    if (mat->AS[w]!= r.AS[jj]){
                        ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
                    if (mat->JA[w]!= r.JA[jj]){
                        ERRPRINT("MERGE ROW ERR JA\n"); return -1;}
                }
            }
        }
    }
    return EXIT_SUCCESS;
}
/*
 * Merge the @mat->M sparse rows @rows into the sparse matrix @mat.
 * EXPECTED: @rows sorted in accord with the target matrix row index @r.
 * Allocates the arrays holding nonzero values and indexes into @mat.
 */
inline int mergeRows(SPACC* rows,spmat* mat){
    ulong nzNum=0;
    //count nnz entries and alloc arrays for them
    for (ulong r=0; r<mat->M; ++r){
        nzNum += rows[r].len;
        mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
        mat->RL[r] = rows[r].len;
#endif
    }
    mat->NZ = nzNum;
    if (!(mat->AS = malloc(nzNum * sizeof(*(mat->AS))))){
        ERRPRINT("merged sparse matrix AS alloc errd\n");
        return EXIT_FAILURE;
    }
    if (!(mat->JA = malloc(nzNum * sizeof(*(mat->JA))))){
        ERRPRINT("merged sparse matrix JA alloc errd\n");
        return EXIT_FAILURE;
    }
    ///POPULATE WITH ROWS NNZ VALUES AND INDEXES
    #pragma omp parallel for schedule(static)
    for (ulong r=0; r<mat->M; r++){
        memcpy(mat->AS+mat->IRP[r], rows[r].AS, rows[r].len*sizeof(*(mat->AS)));
        memcpy(mat->JA+mat->IRP[r], rows[r].JA, rows[r].len*sizeof(*(mat->JA)));
    }
    CONSISTENCY_CHECKS{ //TODO REMOVE: check written nnz manually
        for (ulong r=0,i=0; r<mat->M; r++){
            if (i != mat->IRP[r])
                {ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
            for (ulong j=0; j<rows[r].len; j++,i++){
                if (mat->AS[i]!= rows[r].AS[j]){
                    ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
                if (mat->JA[i] != rows[r].JA[j]){
                    ERRPRINT("MERGE ROW ERR JA\n"); return -1;}
            }
        }
    }
    return EXIT_SUCCESS;
}
/***************************************************************
 * The following code is a modified version of the mergeRows
 * function, split into two parts:
 * - the first calculates the sizes of the arrays and returns the total
 * - the second fills them accordingly
 * This allows the arrays to be allocated as allocatables in
 * Fortran.
 */
inline idx_t calculateSize(SPACC* rows, spmat *mat){
    ulong nzNum=0;
    //count nnz entries for the later allocation of the arrays holding them
    for (ulong r=0; r<mat->M; ++r){
        nzNum += rows[r].len;
        mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
        mat->RL[r] = rows[r].len;
#endif
    }
    mat->NZ = nzNum;
    return nzNum;
}
inline int mergeRowsPopulate(SPACC* rows, spmat* mat, double** AS, idx_t** JA, idx_t** IRP) {
    //populate with the rows' nonzero values and indexes
    #pragma omp parallel for schedule(static)
    for (ulong r = 0; r < mat->M; r++) {
        memcpy(*AS + mat->IRP[r], rows[r].AS, rows[r].len * sizeof(double));
        memcpy(*JA + mat->IRP[r], rows[r].JA, rows[r].len * sizeof(idx_t));
    }
    memcpy(*IRP, mat->IRP, (mat->M + 1) * sizeof(idx_t));
    //TODO add consistency checks as in mergeRows
    return EXIT_SUCCESS;
}
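/*
 * Sketch (disabled) of the two-phase flow for Fortran-side allocation:
 * size first, allocate outside (e.g. as Fortran allocatables), then fill.
 * Illustrative only; the mallocs stand in for the Fortran-owned buffers.
 */
#if 0
idx_t nnz   = calculateSize(rows, mat);         //also fills mat->IRP and mat->NZ
double* AS  = malloc(nnz * sizeof(*AS));
idx_t*  JA  = malloc(nnz * sizeof(*JA));
idx_t*  IRP = malloc((mat->M + 1) * sizeof(*IRP));
mergeRowsPopulate(rows, mat, &AS, &JA, &IRP);   //copy rows into the caller's arrays
#endif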
#endif //SPMMUTILS_H_SINGLE_IMPLEMENTATION