You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
595 lines
19 KiB
C
595 lines
19 KiB
C
/*
|
|
* Sp3MM_for_AlgebraicMultiGrid
|
|
* (C) Copyright 2021-2022
|
|
* Andrea Di Iorio
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions, and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
|
|
* not be used to endorse or promote products derived from this
|
|
* software without specific written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* CSR Sp[3]MM Symbolic step implementations
|
|
* target: compute the output matrix size and the row lens for preallocation
|
|
* direct write out partial results
|
|
* See interfaces in respective header
|
|
*/
|
|
|
|
|
|
/*#pragma message( "compiling Sp3MM_CSR_OMP_Symb_Generic.c with config as:" \
|
|
STR(OFF_F) " - " STR(OUT_IDXS) " - " STR(COL_PARTS) )*/
|
|
#ifndef OFF_F
|
|
#error generic implementation requires OFF_F defined
|
|
#endif
|
|
|
|
///setup aux macros for different signatures implementation via #if arith expr
|
|
#pragma push_macro("OUT_IDXS")
|
|
#pragma push_macro("_OUT_IDXS")
|
|
#pragma push_macro("COL_PARTS")
|
|
#pragma push_macro("_COL_PARTS")
|
|
|
|
#ifdef OUT_IDXS
|
|
#define _OUT_IDXS TRUE
|
|
#else
|
|
#define _OUT_IDXS FALSE
|
|
#define OUT_IDXS _UNDEF
|
|
#endif
|
|
#ifdef COL_PARTS
|
|
#define _COL_PARTS TRUE
|
|
#else
|
|
#define _COL_PARTS FALSE
|
|
#define COL_PARTS _UNDEF
|
|
#endif
|
|
///
|
|
|
|
//////SpMM - rowByrow
|
|
///1row->matrix->outRow
|
|
//RBTREE based
|
|
/*
|
|
* Compute symbolic product of (nnz indexes of) row @aRowJA and matrix @b
|
|
* insert nnz indexes of the mul. result row as nodes in a rbtree rooted at @root
|
|
* with nodes in @nodes which have to be enough for the mul result row (use an UB)
|
|
* Retuns: multiplication result row NNZ number,se CONFIG_MACROS below for more
|
|
*
|
|
* CONFIG_MACROS:
|
|
* if _OUT_IDXS == TRUE return mul.result row nnz idxs in @outIdxs
|
|
* ifdef: OUT_IDXS_RBTREE_NODES: nnz indexes returned inplace sorting rbtree
|
|
* as nnz indexes(JA) of the mul result row
|
|
* else: stop at returning the mul. result row lenght
|
|
* if _COL_PARTS == TRUE return the number of nonzero elements in
|
|
* in each of the @gridCols column partitions inside @rowColPartsLens
|
|
* OFF_F: offset back indexes from fortran
|
|
* TODO also output indexes are shifted (see c_b )
|
|
*/
|
|
static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
idx_t* aRowJA, idx_t aRowLen, spmat* b,rbRoot* root, rbNode* nodes
|
|
#if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES
|
|
,idx_t* outIdxs
|
|
#endif
|
|
#if _COL_PARTS == TRUE
|
|
,ushort gridCols,idx_t* rowColPartsLens
|
|
#endif
|
|
)
|
|
{
|
|
//Compute resulting ab's row non zero indexes and total lenght
|
|
idx_t abRowLen = 0; //mul.result row len, return value
|
|
for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){ //for each entry in a's row
|
|
c_a = aRowJA[i]-OFF_F;
|
|
//gather diffrent nnz indexes in corresponding b's `c_a`th row
|
|
for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){
|
|
c_b = b->JA[j]-OFF_F;
|
|
//check if c_b is nonzero index for mul.result row
|
|
inserted = rbInsertNewKey (root, nodes+abRowLen, c_b);
|
|
abRowLen += inserted; //inserted needed just after this
|
|
/* LESS EFFICIENT THEN BELOW (here no memory of last colPart)
|
|
#if _COL_PARTS == TRUE //keep track of which col partition is c_b in
|
|
if (inserted)
|
|
rowColPartsLens[ matchingUnifRangeIdx(c_b, b->N, gridCols) ]++;
|
|
#endif */
|
|
}
|
|
}
|
|
#if _OUT_IDXS == T && defined OUT_IDXS_RBTREE_NODES
|
|
/* return the non zero indexes of the mul.result row
|
|
* sorting inplace the nodes inserted in the rbtree */
|
|
sortRbNode(nodes,abRowLen);
|
|
#elif _OUT_IDXS == T || _COL_PARTS == T
|
|
uint i=0;
|
|
idx_t k;
|
|
#if _COL_PARTS == T
|
|
//colParts aux vars
|
|
idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols;
|
|
ushort gc=0;
|
|
idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem);
|
|
idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem);
|
|
#endif //_COL_PARTS == T
|
|
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
|
|
k = rb_entry(n,rbNode,rb)->key;
|
|
#if _OUT_IDXS == T
|
|
//return the mul.result nnz index inside the rbNodes
|
|
outIdxs[ i++ ] = k;
|
|
#endif
|
|
#if _COL_PARTS == T
|
|
while (k >= gcEndCol ){ //see if the idx is in another col partition
|
|
// TODO also = since gcEndCol as k is 0based
|
|
gcEndCol = unifRemShareEnd(gc ,_colBlock, _colBlockRem);
|
|
gc++;
|
|
DEBUGCHECKS{ assert( gc < gridCols ); }
|
|
}
|
|
rowColPartsLens[gc]++;
|
|
#endif //_COL_PARTS == T
|
|
}
|
|
#endif //_OUT_IDXS == T ... _COL_PARTS == T
|
|
/*DEBUGCHECKS{ //TODO PRINT NNZ INDEXES FOR MANUAL (PAINFUL CHECK)
|
|
idx_t k;
|
|
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
|
|
k = rb_entry(n,rbNode,rb)->key;
|
|
printf("%lu, ",k);
|
|
}
|
|
printf("\n");
|
|
}*/
|
|
|
|
return abRowLen;
|
|
}
|
|
|
|
//SPVECT_IDX_DENSE_MAP based TODO double implementation for trick syntax folding here...
|
|
/*
|
|
* SPVECT_IDX_DENSE_MAP based, as SpMM_Row_Symb_Rbtree but with idxMap aux idx keeping
|
|
* CONFIG_MACROS (new)
|
|
* IDX_RMUL_SYMB_RBTREE && ( _OUT_IDXS == T || _COL_PARTS == T ):
|
|
* (tmp) symb mult out indexes will be kept via a rbtree
|
|
* otherwise directly in the out array appending them and then sorting them
|
|
* (potentially same n log n)
|
|
*/
|
|
static inline idx_t CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
idx_t* aRowJA, idx_t aRowLen, spmat* b, SPVECT_IDX_DENSE_MAP* idxsMapAcc
|
|
#if _OUT_IDXS == TRUE
|
|
,idx_t* outIdxs
|
|
#endif
|
|
#if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T
|
|
,rbRoot* root, rbNode* nodes
|
|
#endif // _OUT_IDXS == TRUE
|
|
#if _COL_PARTS == TRUE
|
|
,ushort gridCols,idx_t* rowColPartsLens
|
|
#endif
|
|
)
|
|
{
|
|
//Compute resulting ab's row non zero indexes and total lenght
|
|
idx_t abRowLen = 0; //mul.result row len, return value
|
|
for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){ //for each entry in a's row
|
|
c_a = aRowJA[i]-OFF_F;
|
|
//gather diffrent nnz indexes in corresponding b's `c_a`th row
|
|
for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){
|
|
c_b = b->JA[j]-OFF_F;
|
|
//check if c_b is nonzero index for mul.result row
|
|
inserted = spVect_idx_in(c_b,idxsMapAcc);
|
|
#if _OUT_IDXS == T || _COL_PARTS == T //idxs HAS TO be accumulated
|
|
if (inserted)
|
|
#if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F //add it in a RBTREE struct
|
|
rbInsertNewKey (root, nodes+idxsMapAcc->len, c_b);
|
|
#else //append it, then sort
|
|
outIdxs[idxsMapAcc->len] = c_b;
|
|
#endif //IDX_RMUL_SYMB_RBTREE == T //how accumulated key c_b
|
|
#endif //#if _OUT_IDXS == T || _COL_PARTS == T
|
|
}
|
|
}
|
|
abRowLen = idxsMapAcc->len;
|
|
//gather idxs or their sparsity struct in output row
|
|
#if _OUT_IDXS == T || _COL_PARTS == T
|
|
idx_t j = 0,k;
|
|
#if _COL_PARTS == T
|
|
//colParts aux vars
|
|
idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols;
|
|
ushort gc = 0;
|
|
idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem);
|
|
idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem);
|
|
#endif
|
|
#if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F ///idxs recorded in a aux rbtree
|
|
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
|
|
k = rb_entry(n,rbNode,rb)->key;
|
|
#if _OUT_IDXS == T
|
|
outIdxs[ j++ ] = k; //record ordered key sotred from aux rbtree
|
|
#endif
|
|
#else ///idxs recorded in aux append array
|
|
sort_idx_t(outIdxs,abRowLen);
|
|
for (; j < abRowLen; j++){
|
|
k = outIdxs[j]; //(OSS) already ordered in outIndexes arr
|
|
#endif //IDX_RMUL_SYMB_RBTREE == T
|
|
#if _COL_PARTS == T
|
|
while (k >= gcEndCol ){ //see if the idx is in another col partition
|
|
// TODO also = since gcEndCol as k is 0based
|
|
gcEndCol = unifRemShareEnd(gc ,_colBlock, _colBlockRem);
|
|
gc++;
|
|
DEBUGCHECKS{ assert( gc < gridCols ); }
|
|
}
|
|
rowColPartsLens[gc]++;
|
|
#endif //_COL_PARTS == T
|
|
}
|
|
#endif //_OUT_IDXS == T ... _COL_PARTS == T
|
|
return abRowLen;
|
|
}
|
|
|
|
//switch among 2 row_symb_XXX implemenetation aux
|
|
/*
|
|
* SpMM single row symbolic computation
|
|
* select one implementation via @implID
|
|
* among SpMM_Row_Symb_Rbtree or SpMM_Row_Symb_IdxMap
|
|
* args will be forwared accordingly
|
|
*/
|
|
static inline idx_t CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
ROW_MMSYM_IMPL_MODE implID, idx_t* aRowJA, idx_t aRowLen, spmat* b,
|
|
rbRoot* root, rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc
|
|
#if _OUT_IDXS == TRUE
|
|
,idx_t* outIdxs
|
|
#endif
|
|
#if _COL_PARTS == TRUE
|
|
,ushort gridCols,idx_t* rowColPartsLens
|
|
#endif
|
|
)
|
|
{
|
|
if (implID == RBTREE) {
|
|
return CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
aRowJA,aRowLen,b,root,nodes
|
|
#if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES
|
|
,outIdxs
|
|
#endif
|
|
#if _COL_PARTS == TRUE
|
|
,gridCols,rowColPartsLens
|
|
#endif
|
|
);
|
|
}
|
|
else { //IDXMAP
|
|
return CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
aRowJA,aRowLen,b,idxsMapAcc
|
|
#if _OUT_IDXS == TRUE
|
|
,outIdxs
|
|
#endif
|
|
#if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T
|
|
,root, nodes
|
|
#endif
|
|
#if _COL_PARTS == TRUE
|
|
,gridCols, rowColPartsLens
|
|
#endif
|
|
);
|
|
}
|
|
}
|
|
///SpMM row-by-row
|
|
idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
ROW_MMSYM_IMPL_MODE symbRowImplID, spmat* a, spmat* b
|
|
#if _OUT_IDXS == TRUE
|
|
,idx_t*** outIdxs
|
|
#endif
|
|
#if _COL_PARTS == TRUE
|
|
,ushort gridCols, idx_t** rowColPartsLens
|
|
#endif
|
|
)
|
|
{
|
|
|
|
///initial allocations
|
|
rbRoot* rbRoots = NULL;
|
|
rbNode* rbNodes = NULL;
|
|
SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL;
|
|
idx_t *rowLens=NULL,*upperBoundedRowsLens=NULL,*upperBoundedSymMat=NULL;
|
|
idx_t maxRowLen=0;
|
|
int rbTreeUsed = (symbRowImplID == RBTREE ||
|
|
(IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS)) );
|
|
|
|
if ( !(rowLens = malloc(sizeof(*rowLens) * (a->M+1))) ){
|
|
ERRPRINT("SpMM_Symb_ rowLens malloc errd\n");
|
|
goto _err;
|
|
}
|
|
if (_OUT_IDXS == TRUE || rbTreeUsed ){
|
|
if (!(upperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b)))
|
|
goto _err;
|
|
}
|
|
#if _OUT_IDXS == TRUE
|
|
if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){
|
|
ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n");
|
|
goto _err;
|
|
}
|
|
if (!(upperBoundedSymMat = malloc(
|
|
sizeof(*upperBoundedSymMat)*upperBoundedRowsLens[a->M]))){
|
|
ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n");
|
|
goto _err;
|
|
}
|
|
//write rows' start pointer from full matrix JA allocated
|
|
for (idx_t i=0,cumul=0; i<a->M; cumul += upperBoundedRowsLens[i++])
|
|
*outIdxs[i] = upperBoundedSymMat + cumul;
|
|
#endif //#if _OUT_IDXS == TRUE
|
|
#if _COL_PARTS == TRUE
|
|
if (!(*rowColPartsLens = malloc(a->M * gridCols * sizeof(**rowColPartsLens)))){
|
|
ERRPRINT("SpMM_Symb_ rowColPartsLens malloc errd\n");
|
|
goto _err;
|
|
}
|
|
#endif //_COL_PARTS
|
|
uint maxThreads = omp_get_max_threads(); //TODO FROM CFG
|
|
//index keeping aux struct
|
|
//rbtree implementation or idxMap with aux of symbTree for tmp outIdx keeping
|
|
if ( rbTreeUsed ){
|
|
maxRowLen = reductionMaxSeq(upperBoundedRowsLens, a->M);
|
|
//rbTrees for index keeping
|
|
rbRoots = malloc(maxThreads * sizeof(*rbRoots));
|
|
rbNodes = calloc(maxThreads * maxRowLen, sizeof(*rbNodes));
|
|
if( !rbRoots || !rbNodes ){
|
|
ERRPRINT("SpMM_Symb_ threads' aux rbTree mallocs errd\n");
|
|
goto _err;
|
|
}
|
|
//init roots
|
|
for (uint i=0; i<maxThreads; i++)
|
|
rbRoots[i] = RB_ROOT_CACHED;
|
|
}
|
|
if (symbRowImplID == IDXMAP){ //idxMap implementation && not outIdxs via rbtree
|
|
idxsMapAccs = calloc(maxThreads, sizeof(*idxsMapAccs));
|
|
if (!idxsMapAccs){
|
|
ERRPRINT("SpMM_Symb_ idxsMapAccs calloc errd\n");
|
|
goto _err;
|
|
}
|
|
//init idxs maps
|
|
for (uint i=0; i<maxThreads; i++){
|
|
if(initSpVectIdxDenseAcc(b->N,idxsMapAccs+i)) goto _err;
|
|
}
|
|
}
|
|
///rows parallel compute
|
|
idx_t* aRow;
|
|
idx_t aRowLen,rLen,abCumulLen=0;
|
|
int tid;
|
|
rbRoot* tRoot; rbNode* tNodes;
|
|
SPVECT_IDX_DENSE_MAP* tIdxsMapAcc = NULL;
|
|
#pragma omp parallel for schedule(static) \
|
|
private(aRow,aRowLen,rLen, tRoot,tNodes,tid) reduction(+:abCumulLen)
|
|
for(idx_t r=0; r<a->M; r++){
|
|
aRow = a->JA + a->IRP[r]-OFF_F;
|
|
aRowLen = a->IRP[r+1] - a->IRP[r];
|
|
tid = omp_get_thread_num();
|
|
//TODO low overhead pointer airth can be avoided with if (symbRowImplID .. && )
|
|
tIdxsMapAcc = idxsMapAccs + tid;
|
|
tRoot = rbRoots + tid;
|
|
tNodes = rbNodes + tid * maxRowLen;
|
|
|
|
rLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(
|
|
symbRowImplID, aRow, aRowLen, b, tRoot,tNodes, tIdxsMapAcc
|
|
#if _OUT_IDXS == TRUE
|
|
,*outIdxs[r]
|
|
#endif
|
|
#if _COL_PARTS == TRUE
|
|
,gridCols, (*rowColPartsLens) + IDX2D(r,0,gridCols)
|
|
#endif
|
|
);
|
|
rowLens[r] = rLen;
|
|
abCumulLen += rLen;
|
|
///reset symb idxs keeping aux structs
|
|
if (symbRowImplID==RBTREE || (IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS))){
|
|
*tRoot = RB_ROOT_CACHED;
|
|
memset(tNodes,0,rLen * sizeof(*tNodes));
|
|
}
|
|
if (symbRowImplID == IDXMAP) _resetIdxMap(tIdxsMapAcc);
|
|
|
|
}
|
|
rowLens[a->M] = abCumulLen;
|
|
goto _free;
|
|
|
|
_err:
|
|
free(rowLens);
|
|
#if _OUT_IDXS == T
|
|
if (outIdxs) free(*outIdxs);
|
|
#endif
|
|
#if _COL_PARTS == T
|
|
if (rowColPartsLens) free(*rowColPartsLens);
|
|
#endif
|
|
rowLens = NULL;
|
|
_free:
|
|
free(upperBoundedRowsLens);
|
|
free(upperBoundedSymMat);
|
|
free(rbRoots);
|
|
free(rbNodes);
|
|
if (idxsMapAccs){
|
|
for (uint i=0; i<maxThreads; i++) free(idxsMapAccs[i].idxsMap);
|
|
free(idxsMapAccs);
|
|
}
|
|
|
|
return rowLens;
|
|
}
|
|
|
|
|
|
///Sp3MM - rowByrowByrow ////////////////////////////////////////////////////////
|
|
//TODO COL_PARTS VERSION?
|
|
#if _OUT_IDXS == TRUE && _COL_PARTS == FALSE
|
|
idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
|
|
(
|
|
ROW_MMSYM_IMPL_MODE symbMMRowImplID,idx_t* aRowJA,idx_t aRowLen,spmat* b,spmat* c,
|
|
rbRoot* root,rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc,idx_t* abRowJATmp
|
|
#if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES
|
|
,idx_t* outIdxs
|
|
#endif
|
|
)
|
|
{
|
|
struct rb_node* n;
|
|
idx_t abRowLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(symbMMRowImplID,aRowJA,aRowLen,b,root,nodes,idxsMapAcc,abRowJATmp);
|
|
cleanRbNodes(root, nodes, abRowLen);
|
|
idx_t abcRowLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
|
|
(symbMMRowImplID,abRowJATmp,abRowLen,c,root,nodes,idxsMapAcc,abRowJATmp);
|
|
|
|
#if _OUT_IDXS == TRUE
|
|
#ifndef OUT_IDXS_RBTREE_NODES
|
|
//return the mul.result nnz index inside the rbNodes
|
|
uint i=0;
|
|
//rbNodeOrderedVisit(n,root) outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
|
|
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
|
|
outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
|
|
}
|
|
#else
|
|
/* return the non zero indexes of the mul.result row
|
|
* sorting inplace the nodes inserted in the rbtree */
|
|
sortRbNode(nodes,abcRowLen);
|
|
#endif
|
|
#endif
|
|
return abcRowLen;
|
|
}
|
|
|
|
idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
|
|
(
|
|
ROW_MMSYM_IMPL_MODE symbMMRowImplID, spmat* a, spmat* b, spmat* c
|
|
#if _OUT_IDXS == TRUE
|
|
,idx_t*** outIdxs
|
|
#endif
|
|
)
|
|
{
|
|
//idxs keeping aux buffs
|
|
idx_t* abRowsJATmp = NULL;
|
|
rbRoot* rbRoots = NULL; rbNode* rbNodes = NULL;
|
|
SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL;
|
|
|
|
idx_t *abUpperBoundedRowsLens = NULL, *upperBoundedSymMat = NULL;
|
|
///initial allocations
|
|
idx_t* rowLens = malloc(sizeof(*rowLens) * (a->M +1) ); //to return
|
|
if (!rowLens){
|
|
ERRPRINT("SpMM_Symb_ rowLens malloc errd\n");
|
|
goto _err;
|
|
}
|
|
|
|
#if _OUT_IDXS == TRUE
|
|
if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){
|
|
ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n");
|
|
goto _err;
|
|
}
|
|
if (!(abUpperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b)))
|
|
goto _err;
|
|
/*TODO TODO instead of doing one sym product first to have a correct UB
|
|
* use an heuristics here to get output matrix size
|
|
*/
|
|
idx_t abcUBSize = abUpperBoundedRowsLens[a->M] * SP3MM_UB_HEURISTIC;
|
|
if (!(upperBoundedSymMat=malloc(sizeof(*upperBoundedSymMat)*abcUBSize))){
|
|
ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n");
|
|
goto _err;
|
|
}
|
|
//TODO heuristic TO UB rows bounds ... require compacting copy
|
|
for (idx_t i=0,cumul=0; i<a->M;
|
|
cumul += SP3MM_UB_HEURISTIC * abUpperBoundedRowsLens[i++])
|
|
*outIdxs[i] = upperBoundedSymMat + cumul;
|
|
#endif //#if _OUT_IDXS == TRUE
|
|
//rbTrees for index keeping
|
|
uint maxThreads = omp_get_max_threads(); //TODO FROM CFG
|
|
idx_t abMaxRowLen = reductionMaxSeq(abUpperBoundedRowsLens, a->M);
|
|
#ifdef HEURISTICS_UB
|
|
idx_t maxRowLenUB = abMaxRowLen * SP3MM_UB_HEURISTIC; //TODO UB HEURISTC
|
|
#else
|
|
idx_t maxRowLenUB = c->N;
|
|
#endif //HEURISTICS_UB
|
|
|
|
if (!(abRowsJATmp = malloc(maxThreads*maxRowLenUB*sizeof(*abRowsJATmp)) ) ){
|
|
ERRPRINT("Sp3MM_Symb_ abRowsJATmp malloc errd\n");
|
|
goto _err;
|
|
}
|
|
if (symbMMRowImplID == RBTREE || IDX_RMUL_SYMB_RBTREE ){
|
|
rbNodes = malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes));
|
|
rbRoots = malloc(maxThreads * sizeof(*rbRoots));
|
|
if (!rbRoots || !rbNodes ){
|
|
ERRPRINT("Sp3MM_Symb_ rbRoots || rbNodes malloc errd\n");
|
|
goto _err;
|
|
}
|
|
//init roots
|
|
for (uint i=0; i<maxThreads; i++) rbRoots[i] = RB_ROOT_CACHED;
|
|
if (!( rbNodes = malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes)) )){
|
|
ERRPRINT("Sp3MM_Symb_ rbNodes malloc errdn\n");
|
|
goto _err;
|
|
}
|
|
}
|
|
if (symbMMRowImplID == IDXMAP){
|
|
if (!( idxsMapAccs = calloc(maxThreads,sizeof(*idxsMapAccs)) )){
|
|
ERRPRINT("Sp3MM_Symb_ idxsMapAccs calloc errd\n");
|
|
goto _err;
|
|
}
|
|
//init idxs maps
|
|
for (uint i=0; i<maxThreads; i++){
|
|
if(initSpVectIdxDenseAcc(c->N,idxsMapAccs+i)) goto _err;
|
|
}
|
|
}
|
|
///rows parallel compute
|
|
idx_t* aRow;
|
|
idx_t aRowLen,rLen,outCumulLen=0;
|
|
//threads local pointers
|
|
int tid;
|
|
rbRoot* tRoot;
|
|
rbNode* tNodes;
|
|
SPVECT_IDX_DENSE_MAP* tIdxsMapAcc;
|
|
idx_t* tABRowJATmp;
|
|
|
|
#pragma omp parallel for schedule(static) \
|
|
private(aRow,aRowLen,rLen, tRoot,tNodes,tid) reduction(+:outCumulLen)
|
|
for(idx_t r=0; r<a->M; r++){
|
|
aRow = a->JA + a->IRP[r]-OFF_F;
|
|
aRowLen = a->IRP[r+1] - a->IRP[r];
|
|
tid = omp_get_thread_num();
|
|
tRoot = rbRoots + tid;
|
|
tNodes = rbNodes + tid * maxRowLenUB;
|
|
tIdxsMapAcc = NULL; //TODO
|
|
tABRowJATmp = abRowsJATmp + tid * maxRowLenUB;
|
|
rLen = CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
|
|
(symbMMRowImplID, aRow,aRowLen,b,c,tRoot,tNodes,tIdxsMapAcc,tABRowJATmp,
|
|
#if _OUT_IDXS == TRUE
|
|
*outIdxs[r]
|
|
#endif
|
|
);
|
|
outCumulLen += rLen;
|
|
rowLens[r] = rLen;
|
|
|
|
}
|
|
goto _free;
|
|
_err:
|
|
free(rowLens);
|
|
#if _OUT_IDXS == T
|
|
if (*outIdxs) free(*outIdxs);
|
|
#endif
|
|
rowLens = NULL;
|
|
_free:
|
|
free(abUpperBoundedRowsLens);
|
|
free(upperBoundedSymMat);
|
|
free(rbRoots);
|
|
free(rbNodes);
|
|
free(abRowsJATmp);
|
|
|
|
return rowLens;
|
|
}
|
|
|
|
#endif //#if !defined COL_PARTS && defined OUT_IDXS
|
|
|
|
|
|
///restore aux macros entry state
|
|
//#undef _OUT_ID
|
|
//#undef _COL_PARTS
|
|
#pragma pop_macro("OUT_IDXS")
|
|
#pragma pop_macro("_OUT_IDXS")
|
|
#pragma pop_macro("COL_PARTS")
|
|
#pragma pop_macro("_COL_PARTS")
|
|
|
|
|
|
|
|
|