Added sp3mm4amg source code to the repository and created a C wrapper routine to be used in the Fortran interface

sp3mm-interface
wlthr 2 years ago
parent 1a4ae1e973
commit 0c88352530

@ -0,0 +1,7 @@
{
"fortran.fortls.path": "/usr/bin/fortls",
"files.associations": {
"sp3mm_csr_omp_multi.h": "c",
"sp3mm_csr_omp_ub_generic.h": "c"
}
}

@ -0,0 +1,15 @@
Copyright Andrea Di Iorio 2022
This file is part of Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN
Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN. If not, see <http://www.gnu.org/licenses/>.

@ -0,0 +1,66 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
//Get multiple implementations for C/Fortran indexing by re-defining OFF_F & re-including the generic sources
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <alloca.h> //TODO quick hold few CSR cols partition sizes
#include <omp.h>
//UB version deps
#include "Sp3MM_CSR_OMP_Multi.h"
#include "SpMMUtilsMulti.h"
#include "sparseUtilsMulti.h"
#include "ompChunksDivide.h"
#include "parser.h"
#include "utils.h"
#include "macros.h"
#include "sparseMatrix.h"
#include "inlineExports.h"
//Symb version deps
#include "Sp3MM_CSR_OMP_Multi.h"
//global vars -> audit
double Start,End,Elapsed,ElapsedInternal;
#define OFF_F 0
#include "inlineExports_Generic.h"
#include "Sp3MM_CSR_OMP_UB_Generic.c"
#include "Sp3MM_CSR_OMP_Num_Generic.c"
#undef OFF_F
#define OFF_F 1
#include "inlineExports_Generic.h"
#include "Sp3MM_CSR_OMP_UB_Generic.c"
#include "Sp3MM_CSR_OMP_Num_Generic.c"
#undef OFF_F
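/* Illustrative note (hedged, not part of the original build logic): re-defining OFF_F
 * around each re-include makes the CAT()-based names in the generic sources expand into
 * two symbol families, e.g. (assuming CAT simply pastes its tokens):
 *   spmmRowByRow_0(A,B,cfg)   //0-based (C) indexing
 *   spmmRowByRow_1(A,B,cfg)   //1-based (Fortran) indexing, shifted back on output
 * so a hypothetical Fortran-facing wrapper would call the *_1 variants while C callers
 * use the *_0 ones. */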

@ -0,0 +1,249 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/
#ifndef OFF_F
#error generic implementation requires OFF_F defined
#endif
#ifndef SP3MM_OMP_SYMB
#define SP3MM_OMP_SYMB
//aux functions for allocCSRSpMatSymbStep: set IRP from the row sizes produced by the symbolic step
static inline idx_t _setCSR_IRP_1DPartitioing(spmat* m, idx_t* rowSizes){
idx_t r,cumulSize;
for (r=0,cumulSize=0; r<m->M; cumulSize += rowSizes[r++])
m->IRP[r] = cumulSize;
DEBUGCHECKS assert(cumulSize == rowSizes[m->M]);
m->IRP[m->M] = cumulSize;
return cumulSize;
}
static inline idx_t _setCSR_IRP_2DPartitioing(spmat* m,idx_t* rowSizes,ushort gridCols){
idx_t cumulSize=0;
for (idx_t r=0,cumulSizeOld; r<m->M; r++){
m->IRP[r] = cumulSize;
for (ushort gc=0; gc<gridCols; gc++){
cumulSizeOld = cumulSize;
cumulSize += rowSizes[ IDX2D(r,gc,gridCols) ];
//inplace update 2D rowParts len matrix as IRP like start offsets
rowSizes[ IDX2D(r,gc,gridCols) ] = cumulSizeOld;
}
}
DEBUGCHECKS assert(cumulSize == rowSizes[m->M*gridCols]);
m->IRP[m->M] = cumulSize;
return cumulSize;
}
/*
 * allocate the CSR mat @m, setting up IRP and the other buffer allocations correctly
 * if @gridCols == 1: (1D partitioning of @m) -> @rowSizes is an array of row lengths
 * if @gridCols > 1: (2D partitioning of @m) -> @rowSizes is a matrix with
 * elem i,j = length of the j-th column partition (out of @gridCols) of the i-th row
 */
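/* Worked layout example (illustrative only): for a 3-row output with @gridCols == 2,
 * @rowSizes is laid out row-major as
 *   { len(r0,p0), len(r0,p1), len(r1,p0), len(r1,p1), len(r2,p0), len(r2,p1), total }
 * i.e. indexed as rowSizes[ IDX2D(r,gc,gridCols) ], with the grand total stored one
 * slot past the last partition, as the debug asserts above expect. */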
static inline int allocCSRSpMatSymbStep(spmat* m,idx_t* rowSizes,ushort gridCols){
//setup IRP and get cumul, whole size of @m
if (!gridCols) return EXIT_FAILURE;
idx_t cumulSize;
if (gridCols == 1) cumulSize = _setCSR_IRP_1DPartitioing(m,rowSizes);
/*if (gridCols > 1) -- static analyzer misses the unsignedness...*/
else cumulSize = _setCSR_IRP_2DPartitioing(m,rowSizes,gridCols);
m->IRP[m->M] = cumulSize;
m->NZ = cumulSize;
//use the cumulated total: valid for both the 1D and the (in-place updated) 2D layout
if (!(m->AS = malloc(cumulSize * sizeof(*m->AS))) ){
ERRPRINT("allocCSRSpMatSymbStep m->AS malloc errd\n");
return EXIT_FAILURE;
}
if (!(m->JA = malloc(cumulSize * sizeof(*m->JA))) ){
ERRPRINT("allocCSRSpMatSymbStep m->JA malloc errd\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
#endif
//////////////////// COMPUTE CORE Sp[3]MM SYMB-NUMB PHASE //////////////////////////
////////Sp3MM as 2 x SpMM
///1D
spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
idx_t* rowsSizes = NULL;
///init AB matrix with SPMM heuristic preallocation
spmat* AB = allocSpMatrix(A->M,B->N);
if (!AB) goto _err;
///aux structures alloc
if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
ERRPRINT("accVects init failed\n");
goto _err;
}
///SYMBOLIC STEP
if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))
goto _err;
if (allocCSRSpMatSymbStep(AB,rowsSizes,1)) goto _err;
///NUMERIC STEP
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(acc)
for (idx_t r=0; r<A->M; r++){ //row-by-row formulation
acc = accVects + omp_get_thread_num();
_resetAccVect(acc); //rezero for the next A row
//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]
/* direct use of sparse scalar vector multiplication
for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){
ca = A->JA[ja] - OFF_F;
jb = B->IRP[ca] - OFF_F;
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
} */
for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
//direct sparsify: transform the accumulated dense vector into a CSR row
sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
}
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG checkOverallocPercent(rowsSizes,AB);
goto _free;
_err:
if(AB) freeSpmat(AB);
AB=NULL; //nothing'll be returned
_free:
free(rowsSizes);
if(accVects) freeAccsDense(accVects,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
idx_t* rowsSizes = NULL;
///init AB matrix with SPMM heuristic preallocation
spmat* AB = allocSpMatrix(A->M,B->N);
if (!AB) goto _err;
///aux structures alloc
if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
ERRPRINT("accVects init failed\n");
goto _err;
}
///SYMBOLIC STEP
if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))
goto _err;
if (allocCSRSpMatSymbStep(AB,rowsSizes,1))
goto _err;
///NUMERIC STEP
//perform Gustavson over rows blocks -> M / @cfg->gridRows
ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong b,startRow,block; //omp for aux vars
#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
for (b=0; b < cfg->gridRows; b++){
block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
startRow = UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem);
for (idx_t r=startRow; r<startRow+block; r++){
acc = accVects + omp_get_thread_num();
_resetAccVect(acc); //rezero for the next A row
//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]
/* direct use of sparse scalar vector multiplication
for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){
ca = A->JA[ja] - OFF_F;
jb = B->IRP[ca] - OFF_F;
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
} */
for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
//direct sparsify: transform the accumulated dense vector into a CSR row
sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
}
}
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG checkOverallocPercent(rowsSizes,AB);
goto _free;
_err:
if(AB) freeSpmat(AB);
AB=NULL; //nothing'll be returned
_free:
if(rowsSizes) free(rowsSizes);
if(accVects) freeAccsDense(accVects,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
idx_t* bColOffsets = NULL; //B group columns starting offset for each row
ACC_DENSE *accVectors=NULL,*accV;
SPACC* accRowPart;
spmat* AB = allocSpMatrix(A->M,B->N);
SPMM_ACC* outAccumul=NULL;
idx_t *rowsPartsSizes=NULL, *rowSizes=NULL; //for rows' cols partition, correct len
if (!AB) goto _err;
//2D indexing aux vars
ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
////get bColOffsets for B column groups
if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
goto _err;
///TODO TODO
_err:
if (AB) freeSpmat(AB);
AB = NULL;
_free:
free(rowsPartsSizes);
free(bColOffsets);
if (accVectors) freeAccsDense(accVectors,gridSize);
if (outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
/* TODO
///2D
//PARTITIONS NOT ALLOCATED
spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
///SP3MM
spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){
////////Sp3MM direct
///1D
spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){
*/

@ -0,0 +1,594 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * CSR Sp[3]MM Symbolic step implementations
 * target: compute the output matrix size and the row lengths for preallocation,
 * directly writing out partial results
 * See interfaces in the respective header
 */
/*#pragma message( "compiling Sp3MM_CSR_OMP_Symb_Generic.c with config as:" \
STR(OFF_F) " - " STR(OUT_IDXS) " - " STR(COL_PARTS) )*/
#ifndef OFF_F
#error generic implementation requires OFF_F defined
#endif
///set up aux macros to generate the different signatures/implementations via #if arithmetic expressions
#pragma push_macro("OUT_IDXS")
#pragma push_macro("_OUT_IDXS")
#pragma push_macro("COL_PARTS")
#pragma push_macro("_COL_PARTS")
#ifdef OUT_IDXS
#define _OUT_IDXS TRUE
#else
#define _OUT_IDXS FALSE
#define OUT_IDXS _UNDEF
#endif
#ifdef COL_PARTS
#define _COL_PARTS TRUE
#else
#define _COL_PARTS FALSE
#define COL_PARTS _UNDEF
#endif
///
//////SpMM - rowByrow
///1row->matrix->outRow
//RBTREE based
/*
 * Compute the symbolic product of (the nnz indexes of) row @aRowJA and matrix @b:
 * insert the nnz indexes of the mul. result row as nodes in a rbtree rooted at @root,
 * with nodes taken from @nodes, which must be enough for the mul. result row (use an UB)
 * Returns: multiplication result row NNZ count; see CONFIG_MACROS below for more
 *
 * CONFIG_MACROS:
 * if _OUT_IDXS == TRUE return the mul. result row nnz idxs in @outIdxs
 * ifdef OUT_IDXS_RBTREE_NODES: nnz indexes returned by inplace sorting the rbtree
 * nodes as nnz indexes (JA) of the mul. result row
 * else: stop at returning the mul. result row length
 * if _COL_PARTS == TRUE return the number of nonzero elements in
 * each of the @gridCols column partitions inside @rowColPartsLens
 * OFF_F: offset back indexes from Fortran
 * TODO also output indexes are shifted (see c_b )
 */
static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
(
idx_t* aRowJA, idx_t aRowLen, spmat* b,rbRoot* root, rbNode* nodes
#if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES
,idx_t* outIdxs
#endif
#if _COL_PARTS == TRUE
,ushort gridCols,idx_t* rowColPartsLens
#endif
)
{
//Compute the resulting ab row's nonzero indexes and total length
idx_t abRowLen = 0; //mul.result row len, return value
for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){ //for each entry in a's row
c_a = aRowJA[i]-OFF_F;
//gather the distinct nnz indexes in b's corresponding `c_a`-th row
for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){
c_b = b->JA[j]-OFF_F;
//check if c_b is nonzero index for mul.result row
inserted = rbInsertNewKey (root, nodes+abRowLen, c_b);
abRowLen += inserted; //inserted needed just after this
/* LESS EFFICIENT THAN BELOW (here no memory of the last colPart)
#if _COL_PARTS == TRUE //keep track of which col partition is c_b in
if (inserted)
rowColPartsLens[ matchingUnifRangeIdx(c_b, b->N, gridCols) ]++;
#endif */
}
}
#if _OUT_IDXS == T && defined OUT_IDXS_RBTREE_NODES
/* return the non zero indexes of the mul.result row
* sorting inplace the nodes inserted in the rbtree */
sortRbNode(nodes,abRowLen);
#elif _OUT_IDXS == T || _COL_PARTS == T
uint i=0;
idx_t k;
#if _COL_PARTS == T
//colParts aux vars
idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols;
ushort gc=0;
idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem);
idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem);
#endif //_COL_PARTS == T
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
k = rb_entry(n,rbNode,rb)->key;
#if _OUT_IDXS == T
//return the mul.result nnz index inside the rbNodes
outIdxs[ i++ ] = k;
#endif
#if _COL_PARTS == T
while (k >= gcEndCol ){ //see if the idx is in another col partition
// TODO also = since gcEndCol as k is 0based
gcEndCol = unifRemShareEnd(gc ,_colBlock, _colBlockRem);
gc++;
DEBUGCHECKS{ assert( gc < gridCols ); }
}
rowColPartsLens[gc]++;
#endif //_COL_PARTS == T
}
#endif //_OUT_IDXS == T ... _COL_PARTS == T
/*DEBUGCHECKS{ //TODO PRINT NNZ INDEXES FOR MANUAL (PAINFUL CHECK)
idx_t k;
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
k = rb_entry(n,rbNode,rb)->key;
printf("%lu, ",k);
}
printf("\n");
}*/
return abRowLen;
}
//SPVECT_IDX_DENSE_MAP based TODO double implementation for trick syntax folding here...
/*
 * SPVECT_IDX_DENSE_MAP based: as SpMM_Row_Symb_Rbtree but keeping indexes via the aux idxMap
 * CONFIG_MACROS (new):
 * IDX_RMUL_SYMB_RBTREE && ( _OUT_IDXS == T || _COL_PARTS == T ):
 * (tmp) symb mult out indexes will be kept via a rbtree
 * otherwise directly in the out array, appending them and then sorting them
 * (potentially the same n log n)
 */
static inline idx_t CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)
(
idx_t* aRowJA, idx_t aRowLen, spmat* b, SPVECT_IDX_DENSE_MAP* idxsMapAcc
#if _OUT_IDXS == TRUE
,idx_t* outIdxs
#endif
#if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T
,rbRoot* root, rbNode* nodes
#endif // _OUT_IDXS == TRUE
#if _COL_PARTS == TRUE
,ushort gridCols,idx_t* rowColPartsLens
#endif
)
{
//Compute the resulting ab row's nonzero indexes and total length
idx_t abRowLen = 0; //mul.result row len, return value
for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){ //for each entry in a's row
c_a = aRowJA[i]-OFF_F;
//gather the distinct nnz indexes in b's corresponding `c_a`-th row
for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){
c_b = b->JA[j]-OFF_F;
//check if c_b is nonzero index for mul.result row
inserted = spVect_idx_in(c_b,idxsMapAcc);
#if _OUT_IDXS == T || _COL_PARTS == T //idxs HAS TO be accumulated
if (inserted)
#if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F //add it in a RBTREE struct
rbInsertNewKey (root, nodes+idxsMapAcc->len, c_b);
#else //append it, then sort
outIdxs[idxsMapAcc->len] = c_b;
#endif //IDX_RMUL_SYMB_RBTREE == T //how accumulated key c_b
#endif //#if _OUT_IDXS == T || _COL_PARTS == T
}
}
abRowLen = idxsMapAcc->len;
//gather idxs or their sparsity struct in output row
#if _OUT_IDXS == T || _COL_PARTS == T
idx_t j = 0,k;
#if _COL_PARTS == T
//colParts aux vars
idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols;
ushort gc = 0;
idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem);
idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem);
#endif
#if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F ///idxs recorded in a aux rbtree
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
k = rb_entry(n,rbNode,rb)->key;
#if _OUT_IDXS == T
outIdxs[ j++ ] = k; //record ordered keys sorted via the aux rbtree
#endif
#else ///idxs recorded in aux append array
sort_idx_t(outIdxs,abRowLen);
for (; j < abRowLen; j++){
k = outIdxs[j]; //(OSS) already ordered in outIndexes arr
#endif //IDX_RMUL_SYMB_RBTREE == T
#if _COL_PARTS == T
while (k >= gcEndCol ){ //see if the idx is in another col partition
// TODO also = since gcEndCol as k is 0based
gcEndCol = unifRemShareEnd(gc ,_colBlock, _colBlockRem);
gc++;
DEBUGCHECKS{ assert( gc < gridCols ); }
}
rowColPartsLens[gc]++;
#endif //_COL_PARTS == T
}
#endif //_OUT_IDXS == T ... _COL_PARTS == T
return abRowLen;
}
//aux to switch between the 2 row_symb_XXX implementations
/*
 * SpMM single row symbolic computation
 * select one implementation via @implID,
 * among SpMM_Row_Symb_Rbtree and SpMM_Row_Symb_IdxMap;
 * args will be forwarded accordingly
 */
static inline idx_t CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
(
ROW_MMSYM_IMPL_MODE implID, idx_t* aRowJA, idx_t aRowLen, spmat* b,
rbRoot* root, rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc
#if _OUT_IDXS == TRUE
,idx_t* outIdxs
#endif
#if _COL_PARTS == TRUE
,ushort gridCols,idx_t* rowColPartsLens
#endif
)
{
if (implID == RBTREE) {
return CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
(
aRowJA,aRowLen,b,root,nodes
#if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES
,outIdxs
#endif
#if _COL_PARTS == TRUE
,gridCols,rowColPartsLens
#endif
);
}
else { //IDXMAP
return CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)
(
aRowJA,aRowLen,b,idxsMapAcc
#if _OUT_IDXS == TRUE
,outIdxs
#endif
#if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T
,root, nodes
#endif
#if _COL_PARTS == TRUE
,gridCols, rowColPartsLens
#endif
);
}
}
///SpMM row-by-row
idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
(
ROW_MMSYM_IMPL_MODE symbRowImplID, spmat* a, spmat* b
#if _OUT_IDXS == TRUE
,idx_t*** outIdxs
#endif
#if _COL_PARTS == TRUE
,ushort gridCols, idx_t** rowColPartsLens
#endif
)
{
///initial allocations
rbRoot* rbRoots = NULL;
rbNode* rbNodes = NULL;
SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL;
idx_t *rowLens=NULL,*upperBoundedRowsLens=NULL,*upperBoundedSymMat=NULL;
idx_t maxRowLen=0;
int rbTreeUsed = (symbRowImplID == RBTREE ||
(IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS)) );
if ( !(rowLens = malloc(sizeof(*rowLens) * (a->M+1))) ){
ERRPRINT("SpMM_Symb_ rowLens malloc errd\n");
goto _err;
}
if (_OUT_IDXS == TRUE || rbTreeUsed ){
if (!(upperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b)))
goto _err;
}
#if _OUT_IDXS == TRUE
if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){
ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n");
goto _err;
}
if (!(upperBoundedSymMat = malloc(
sizeof(*upperBoundedSymMat)*upperBoundedRowsLens[a->M]))){
ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n");
goto _err;
}
//write each row's start pointer into the fully allocated symbolic JA buffer
for (idx_t i=0,cumul=0; i<a->M; cumul += upperBoundedRowsLens[i++])
(*outIdxs)[i] = upperBoundedSymMat + cumul;
#endif //#if _OUT_IDXS == TRUE
#if _COL_PARTS == TRUE
if (!(*rowColPartsLens = malloc(a->M * gridCols * sizeof(**rowColPartsLens)))){
ERRPRINT("SpMM_Symb_ rowColPartsLens malloc errd\n");
goto _err;
}
#endif //_COL_PARTS
uint maxThreads = omp_get_max_threads(); //TODO FROM CFG
//index keeping aux struct
//rbtree implementation or idxMap with aux of symbTree for tmp outIdx keeping
if ( rbTreeUsed ){
maxRowLen = reductionMaxSeq(upperBoundedRowsLens, a->M);
//rbTrees for index keeping
rbRoots = malloc(maxThreads * sizeof(*rbRoots));
rbNodes = calloc(maxThreads * maxRowLen, sizeof(*rbNodes));
if( !rbRoots || !rbNodes ){
ERRPRINT("SpMM_Symb_ threads' aux rbTree mallocs errd\n");
goto _err;
}
//init roots
for (uint i=0; i<maxThreads; i++)
rbRoots[i] = RB_ROOT_CACHED;
}
if (symbRowImplID == IDXMAP){ //idxMap implementation && not outIdxs via rbtree
idxsMapAccs = calloc(maxThreads, sizeof(*idxsMapAccs));
if (!idxsMapAccs){
ERRPRINT("SpMM_Symb_ idxsMapAccs calloc errd\n");
goto _err;
}
//init idxs maps
for (uint i=0; i<maxThreads; i++){
if(initSpVectIdxDenseAcc(b->N,idxsMapAccs+i)) goto _err;
}
}
///rows parallel compute
idx_t* aRow;
idx_t aRowLen,rLen,abCumulLen=0;
int tid;
rbRoot* tRoot; rbNode* tNodes;
SPVECT_IDX_DENSE_MAP* tIdxsMapAcc = NULL;
#pragma omp parallel for schedule(static) \
private(aRow,aRowLen,rLen, tRoot,tNodes,tid,tIdxsMapAcc) reduction(+:abCumulLen)
for(idx_t r=0; r<a->M; r++){
aRow = a->JA + a->IRP[r]-OFF_F;
aRowLen = a->IRP[r+1] - a->IRP[r];
tid = omp_get_thread_num();
//TODO the low-overhead pointer arith can be avoided with if (symbRowImplID .. && )
tIdxsMapAcc = idxsMapAccs + tid;
tRoot = rbRoots + tid;
tNodes = rbNodes + tid * maxRowLen;
rLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
(
symbRowImplID, aRow, aRowLen, b, tRoot,tNodes, tIdxsMapAcc
#if _OUT_IDXS == TRUE
,(*outIdxs)[r]
#endif
#if _COL_PARTS == TRUE
,gridCols, (*rowColPartsLens) + IDX2D(r,0,gridCols)
#endif
);
rowLens[r] = rLen;
abCumulLen += rLen;
///reset symb idxs keeping aux structs
if (symbRowImplID==RBTREE || (IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS))){
*tRoot = RB_ROOT_CACHED;
memset(tNodes,0,rLen * sizeof(*tNodes));
}
if (symbRowImplID == IDXMAP) _resetIdxMap(tIdxsMapAcc);
}
rowLens[a->M] = abCumulLen;
goto _free;
_err:
free(rowLens);
#if _OUT_IDXS == T
if (outIdxs) free(*outIdxs);
#endif
#if _COL_PARTS == T
if (rowColPartsLens) free(*rowColPartsLens);
#endif
rowLens = NULL;
_free:
free(upperBoundedRowsLens);
free(upperBoundedSymMat);
free(rbRoots);
free(rbNodes);
if (idxsMapAccs){
for (uint i=0; i<maxThreads; i++) free(idxsMapAccs[i].idxsMap);
free(idxsMapAccs);
}
return rowLens;
}
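/* Note (illustrative): the returned rowLens (grand total in rowLens[a->M]) is what the
 * numeric step feeds to allocCSRSpMatSymbStep(), e.g. for the basic 0-based variant:
 *   idx_t* rl = SpMM_Symb___0(cfg->symbMMRowImplID, A, B);
 *   if (rl && !allocCSRSpMatSymbStep(AB, rl, 1)){ ...numeric phase over AB... }
 * as done in spmmRowByRow_SymbNum_0(). */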
///Sp3MM - rowByrowByrow ////////////////////////////////////////////////////////
//TODO COL_PARTS VERSION?
#if _OUT_IDXS == TRUE && _COL_PARTS == FALSE
idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
(
ROW_MMSYM_IMPL_MODE symbMMRowImplID,idx_t* aRowJA,idx_t aRowLen,spmat* b,spmat* c,
rbRoot* root,rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc,idx_t* abRowJATmp
#if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES
,idx_t* outIdxs
#endif
)
{
struct rb_node* n;
idx_t abRowLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
(symbMMRowImplID,aRowJA,aRowLen,b,root,nodes,idxsMapAcc,abRowJATmp);
cleanRbNodes(root, nodes, abRowLen);
idx_t abcRowLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
(symbMMRowImplID,abRowJATmp,abRowLen,c,root,nodes,idxsMapAcc,abRowJATmp);
#if _OUT_IDXS == TRUE
#ifndef OUT_IDXS_RBTREE_NODES
//return the mul.result nnz index inside the rbNodes
uint i=0;
//rbNodeOrderedVisit(n,root) outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
}
#else
/* return the non zero indexes of the mul.result row
* sorting inplace the nodes inserted in the rbtree */
sortRbNode(nodes,abcRowLen);
#endif
#endif
return abcRowLen;
}
idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
(
ROW_MMSYM_IMPL_MODE symbMMRowImplID, spmat* a, spmat* b, spmat* c
#if _OUT_IDXS == TRUE
,idx_t*** outIdxs
#endif
)
{
//idxs keeping aux buffs
idx_t* abRowsJATmp = NULL;
rbRoot* rbRoots = NULL; rbNode* rbNodes = NULL;
SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL;
idx_t *abUpperBoundedRowsLens = NULL, *upperBoundedSymMat = NULL;
///initial allocations
idx_t* rowLens = malloc(sizeof(*rowLens) * (a->M +1) ); //to return
if (!rowLens){
ERRPRINT("SpMM_Symb_ rowLens malloc errd\n");
goto _err;
}
#if _OUT_IDXS == TRUE
if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){
ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n");
goto _err;
}
if (!(abUpperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b)))
goto _err;
/*TODO TODO instead of doing one sym product first to have a correct UB
* use a heuristic here to get the output matrix size
*/
idx_t abcUBSize = abUpperBoundedRowsLens[a->M] * SP3MM_UB_HEURISTIC;
if (!(upperBoundedSymMat=malloc(sizeof(*upperBoundedSymMat)*abcUBSize))){
ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n");
goto _err;
}
//TODO heuristic TO UB rows bounds ... require compacting copy
for (idx_t i=0,cumul=0; i<a->M;
cumul += SP3MM_UB_HEURISTIC * abUpperBoundedRowsLens[i++])
(*outIdxs)[i] = upperBoundedSymMat + cumul;
#endif //#if _OUT_IDXS == TRUE
//rbTrees for index keeping
uint maxThreads = omp_get_max_threads(); //TODO FROM CFG
idx_t abMaxRowLen = reductionMaxSeq(abUpperBoundedRowsLens, a->M);
#ifdef HEURISTICS_UB
idx_t maxRowLenUB = abMaxRowLen * SP3MM_UB_HEURISTIC; //TODO UB HEURISTC
#else
idx_t maxRowLenUB = c->N;
#endif //HEURISTICS_UB
if (!(abRowsJATmp = malloc(maxThreads*maxRowLenUB*sizeof(*abRowsJATmp)) ) ){
ERRPRINT("Sp3MM_Symb_ abRowsJATmp malloc errd\n");
goto _err;
}
if (symbMMRowImplID == RBTREE || IDX_RMUL_SYMB_RBTREE ){
rbNodes = malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes));
rbRoots = malloc(maxThreads * sizeof(*rbRoots));
if (!rbRoots || !rbNodes ){
ERRPRINT("Sp3MM_Symb_ rbRoots || rbNodes malloc errd\n");
goto _err;
}
//init roots
for (uint i=0; i<maxThreads; i++) rbRoots[i] = RB_ROOT_CACHED;
}
if (symbMMRowImplID == IDXMAP){
if (!( idxsMapAccs = calloc(maxThreads,sizeof(*idxsMapAccs)) )){
ERRPRINT("Sp3MM_Symb_ idxsMapAccs calloc errd\n");
goto _err;
}
//init idxs maps
for (uint i=0; i<maxThreads; i++){
if(initSpVectIdxDenseAcc(c->N,idxsMapAccs+i)) goto _err;
}
}
///rows parallel compute
idx_t* aRow;
idx_t aRowLen,rLen,outCumulLen=0;
//threads local pointers
int tid;
rbRoot* tRoot;
rbNode* tNodes;
SPVECT_IDX_DENSE_MAP* tIdxsMapAcc;
idx_t* tABRowJATmp;
#pragma omp parallel for schedule(static) \
private(aRow,aRowLen,rLen, tRoot,tNodes,tid,tIdxsMapAcc,tABRowJATmp) reduction(+:outCumulLen)
for(idx_t r=0; r<a->M; r++){
aRow = a->JA + a->IRP[r]-OFF_F;
aRowLen = a->IRP[r+1] - a->IRP[r];
tid = omp_get_thread_num();
tRoot = rbRoots + tid;
tNodes = rbNodes + tid * maxRowLenUB;
tIdxsMapAcc = NULL; //TODO
tABRowJATmp = abRowsJATmp + tid * maxRowLenUB;
rLen = CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
(symbMMRowImplID, aRow,aRowLen,b,c,tRoot,tNodes,tIdxsMapAcc,tABRowJATmp,
#if _OUT_IDXS == TRUE
(*outIdxs)[r]
#endif
);
outCumulLen += rLen;
rowLens[r] = rLen;
}
goto _free;
_err:
free(rowLens);
#if _OUT_IDXS == T
if (*outIdxs) free(*outIdxs);
#endif
rowLens = NULL;
_free:
free(abUpperBoundedRowsLens);
free(upperBoundedSymMat);
free(rbRoots);
free(rbNodes);
free(abRowsJATmp);
return rowLens;
}
#endif //#if !defined COL_PARTS && defined OUT_IDXS
///restore aux macros entry state
//#undef _OUT_ID
//#undef _COL_PARTS
#pragma pop_macro("OUT_IDXS")
#pragma pop_macro("_OUT_IDXS")
#pragma pop_macro("COL_PARTS")
#pragma pop_macro("_COL_PARTS")

@ -0,0 +1,92 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "macros.h"
#include "config.h"
#include "sparseMatrix.h"
#include "linuxK_rbtree_minimalized.h"
#include "Sp3MM_CSR_OMP_SymbStep_Multi.h"
#include "sparseUtilsMulti.h"
#include "utils.h"
#include <omp.h>
//inline exports
//ushort matchingUnifRangeIdx(idx_t idx, idx_t size, ushort rangesN);
idx_t* CAT(spMMSizeUpperbound_,0)(spmat* ,spmat* );
idx_t* CAT(spMMSizeUpperbound_,1)(spmat* ,spmat* );
idx_t* CAT(spMMSizeUpperboundColParts_,0)(spmat* ,spmat* ,ushort,idx_t*);
idx_t* CAT(spMMSizeUpperboundColParts_,1)(spmat* ,spmat* ,ushort,idx_t*);
/* Multi implementation of the symbolic product of sparse matrices; config macros:
 * OFF_F: C-Fortran spmat indexing
 * OUT_IDXS: indexes output
 * COL_PARTS: column-partitions output...
*/
#define OUT_IDXS_ON OutIdxs_
#define COL_PARTS_ON ColParts_
#undef OUT_IDXS
#undef COL_PARTS
#define OFF_F 0
///generate basic versions
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
///generate outIdxs versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
#undef OUT_IDXS
///generate colParts versions
#define COL_PARTS COL_PARTS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
//generate outIdxs AND colParts versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
#undef OUT_IDXS
#undef COL_PARTS
#undef OFF_F
#define OFF_F 1
///generate basic versions
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
///generate outIdxs versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
#undef OUT_IDXS
///generate colParts versions
#define COL_PARTS COL_PARTS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
//generate outIdxs AND colParts versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.c"
#undef OUT_IDXS
#undef COL_PARTS
#undef OFF_F
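/* Illustrative naming note (hedged, not a definitive list): each re-include above
 * instantiates one CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F) driver plus its per-row
 * kernels; e.g. for OFF_F=0 the OutIdxs_+ColParts_ build yields
 * SpMM_Symb_OutIdxs_ColParts_0, while the basic build is invoked elsewhere as
 * SpMM_Symb___0 (which suggests _UNDEF expands to a lone underscore). */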

@ -0,0 +1,573 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/
#ifndef OFF_F
#error generic implementation requires OFF_F defined
#endif
//////////////////// COMPUTE CORE Sp[3]MM Upperbound //////////////////////////
spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){ //serial implementation
spmat* AB = NULL;
ACC_DENSE acc;
if ( allocAccDense(&acc,B->N) ) goto _free;
if (!(AB = allocSpMatrix(A->M,B->N))) goto _free;
for( idx_t r=0; r<A->M; r++ ){
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, &acc);
sparsifyDirect(&acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
}
_free:
freeAccsDense(&acc,1);
return AB;
}
////////Sp3MM as 2 x SpMM
///1D
spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
///thread aux
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
idx_t* rowsSizes = NULL;
///init AB matrix with SPMM heuristic preallocation
spmat* AB = allocSpMatrix(A->M,B->N);
if (!AB) goto _err;
if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F) (A,B))) goto _err;
///aux structures alloc
if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
ERRPRINT("accVects init failed\n");
goto _err;
}
if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err;
#if SPARSIFY_PRE_PARTITIONING == T
//prepare sparse accumulators with U.Bounded rows[parts] starts
SPACC* accSp;
for( idx_t r=0,rSizeCumul=0; r<AB->M; rSizeCumul += rowsSizes[r++]){
accSp = outAccumul->accs+r;
accSp->JA = outAccumul->JA + rSizeCumul;
accSp->AS = outAccumul->AS + rSizeCumul;
//accSp->len = rowsSizes[r];
}
#endif
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(acc)
for (ulong r=0; r<A->M; r++){ //row-by-row formulation
//iterate over nz entry index c inside current row r
acc = accVects + omp_get_thread_num();
/* direct use of sparse scalar vector multiplication
for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){
ca = A->JA[ja] - OFF_F;
jb = B->IRP[ca] - OFF_F;
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc);
}*/
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
//transform the accumulated dense vector into a CSR row
#if SPARSIFY_PRE_PARTITIONING == T
_sparsifyUB(acc,outAccumul->accs+r,0);
#else
sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0);
#endif
_resetAccVect(acc); //rezero for the next A row
}
///merge sparse row computed before
if (mergeRows(outAccumul->accs,AB)) goto _err;
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG checkOverallocPercent(rowsSizes,AB);
goto _free;
_err:
if(AB) freeSpmat(AB);
AB=NULL; //nothing'll be returned
_free:
if(rowsSizes) free(rowsSizes);
if(accVects) freeAccsDense(accVects,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
///thread aux
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
idx_t* rowsSizes = NULL;
///init AB matrix with SPMM heuristic preallocation
spmat* AB = allocSpMatrix(A->M,B->N);
if (!AB) goto _err;
if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) goto _err;
///aux structures alloc
if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
ERRPRINT("accVects init failed\n");
goto _err;
}
if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err;
#if SPARSIFY_PRE_PARTITIONING == T
//prepare sparse accumulators with U.Bounded rows[parts] starts
SPACC* accSp;
for( idx_t r=0,rSizeCumul=0; r<AB->M; rSizeCumul += rowsSizes[r++]){
accSp = outAccumul->accs+r;
accSp->JA = outAccumul->JA + rSizeCumul;
accSp->AS = outAccumul->AS + rSizeCumul;
}
#endif
//perform Gustavson over rows blocks -> M / @cfg->gridRows
ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong b,startRow,block; //omp for aux vars
#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
for (b=0; b < cfg->gridRows; b++){
block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
startRow= UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem);
acc = accVects + omp_get_thread_num();
DEBUGPRINT{
fflush(NULL);
printf("block %lu\t%lu:%lu(%lu)\n",b,startRow,startRow+block-1,block);
fflush(NULL);
}
//row-by-row formulation in the given row block
for (ulong r=startRow; r<startRow+block; r++){
//iterate over nz entry index c inside current row r
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++)
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
//transform the accumulated dense vector into a CSR row
#if SPARSIFY_PRE_PARTITIONING == T
_sparsifyUB(acc,outAccumul->accs+r,0);
#else
sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0);
#endif
_resetAccVect(acc); //rezero for the next A row
}
}
///merge sparse row computed before
if (mergeRows(outAccumul->accs,AB)) goto _err;
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG checkOverallocPercent(rowsSizes,AB);
goto _free;
_err:
if(AB) freeSpmat(AB);
AB=NULL; //nothing'll be returned
_free:
if(rowsSizes) free(rowsSizes);
if(accVects) freeAccsDense(accVects,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
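/* Illustrative arithmetic for the row-block split above (assuming UNIF_REMINDER_DISTRI
 * hands the remainder out as one extra row to each leading block): with AB->M = 10 and
 * cfg->gridRows = 3, rowBlock = 3 and rowBlockRem = 1, so the blocks are sized 4,3,3
 * and start at rows 0,4,7. */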
///2D
//PARTITIONS NOT ALLOCATED
spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
idx_t* bColOffsets = NULL; //B group columns starting offset for each row
ACC_DENSE *accVectors=NULL,*accV;
SPACC* accRowPart;
spmat* AB = allocSpMatrix(A->M,B->N);
SPMM_ACC* outAccumul=NULL;
idx_t* rowsPartsSizes=NULL;
if (!AB) goto _err;
//2D indexing aux vars
ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
////get bColOffsets for B column groups
if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
goto _err;
#if SPARSIFY_PRE_PARTITIONING == T
uint rowsPartsSizesN = aSubRowsN;
if (!(rowsPartsSizes = CAT(spMMSizeUpperboundColParts_,OFF_F)
(A,B,cfg->gridCols,bColOffsets)))
#else
uint rowsPartsSizesN = AB->M;
if (!(rowsPartsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B)))
#endif
goto _err;
//aux vectors
///other AUX struct alloc
if (!(accVectors = _initAccVectors(gridSize,_colBlock+(_colBlockRem?1:0)))){
ERRPRINT("accVectors calloc failed\n");
goto _err;
}
if (!(outAccumul = initSpMMAcc(rowsPartsSizes[rowsPartsSizesN],aSubRowsN)))
goto _err;
#if SPARSIFY_PRE_PARTITIONING == T
//prepare sparse accumulators with U.Bounded rows[parts] starts
SPACC* accSp;
for( idx_t i=0,rSizeCumul=0; i<aSubRowsN; rSizeCumul += rowsPartsSizes[i++]){
accSp = outAccumul->accs+i;
accSp->JA = outAccumul->JA + rSizeCumul;
accSp->AS = outAccumul->AS + rSizeCumul;
}
//memset(outAccumul->AS,0,sizeof(double)*rowsSizes[AB->M]);memset(outAccumul->JA,0,sizeof(idx_t)*rowsSizes[AB->M]);
#endif
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong tileID,t_i,t_j; //for aux vars
ulong bPartLen,bPartID,bPartOffset;//B partition acces aux vars
#pragma omp parallel for schedule(runtime) \
private(accV,accRowPart,rowBlock,colBlock,startRow,startCol,\
bPartLen,bPartID,bPartOffset,t_i,t_j)
for (tileID = 0; tileID < gridSize; tileID++){
///get iteration's indexing variables
//tile index in the 2D grid of AB computation TODO OMP HOW TO PARALLELIZE 2 FOR
t_i = tileID/cfg->gridCols; //i-th row block
t_j = tileID%cfg->gridCols; //j-th col block
//get tile row-cols group FAIR sizes
rowBlock = UNIF_REMINDER_DISTRI(t_i,_rowBlock,_rowBlockRem);
startRow = UNIF_REMINDER_DISTRI_STARTIDX(t_i,_rowBlock,_rowBlockRem);
startCol = UNIF_REMINDER_DISTRI_STARTIDX(t_j,_colBlock,_colBlockRem);
accV = accVectors + tileID;
DEBUGPRINT{
fflush(NULL);
colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem);
printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock);
printf("colBlock [%lu\t%lu:%lu(%lu)]\n",t_j,startCol,startCol+colBlock-1,colBlock);
fflush(NULL);
}
///AB[t_i][t_j] block compute
for (ulong r=startRow; r<startRow+rowBlock; r++){
//iterate over nz col index j inside current row r
//row-by-row restricted to colsubset of B to get AB[r][:colBlock:]
for (ulong j=A->IRP[r]-OFF_F,c; j<A->IRP[r+1]-OFF_F; j++){
//get start of B[A->JA[j]][:colBlock:]
c = A->JA[j]-OFF_F; // col of nnz in A[r][:] <-> target B row
bPartID = IDX2D(c,t_j,cfg->gridCols);
bPartOffset = bColOffsets[ bPartID ];
bPartLen = bColOffsets[ bPartID + 1 ] - bPartOffset;
CAT(scSparseVectMulPart_,OFF_F)(A->AS[j],B->AS+bPartOffset,
B->JA+bPartOffset,bPartLen,startCol,accV);
}
accRowPart = outAccumul->accs + IDX2D(r,t_j,cfg->gridCols);
#if SPARSIFY_PRE_PARTITIONING == T
_sparsifyUB(accV,accRowPart,startCol);
#else
sparsifyUBNoPartsBounds(outAccumul,accV,accRowPart,startCol);
#endif
_resetAccVect(accV);
}
}
if (mergeRowsPartitions(outAccumul->accs,AB,cfg)) goto _err;
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG
CAT(checkOverallocRowPartsPercent_, OFF_F)(rowsPartsSizes, AB, cfg->gridCols, bColOffsets);
goto _free;
_err:
if (AB) freeSpmat(AB);
AB = NULL;
_free:
free(rowsPartsSizes);
free(bColOffsets);
if (accVectors) freeAccsDense(accVectors,gridSize);
if (outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%luxN=%lu\n",A->M,B->N);
DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
spmat *AB = NULL, *colPartsB = NULL, *colPart;
idx_t* rowsPartsSizes=NULL;
//aux vectors
SPMM_ACC* outAccumul=NULL;
ACC_DENSE *accVectors=NULL,*accV;
SPACC* accRowPart;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
//2D indexing aux vars
idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
idx_t* bColOffsets = NULL;
if (!(AB = allocSpMatrix(A->M,B->N))) goto _err;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
////B cols partition in CSRs
//if (!(colPartsB = CAT(colsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) goto _err;
if (!(colPartsB = CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)
(B, cfg->gridCols, &bColOffsets)))
goto _err;
#if SPARSIFY_PRE_PARTITIONING == T
uint rowsPartsSizesN = aSubRowsN;
if (!(rowsPartsSizes = CAT(spMMSizeUpperboundColParts_,OFF_F)
(A, B, cfg->gridCols, bColOffsets)))
#else
uint rowsPartsSizesN = AB->M;
if (!(rowsPartsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B)))
#endif
goto _err;
///other AUX struct alloc
if (!(accVectors = _initAccVectors(gridSize,_colBlock+(_colBlockRem?1:0)))){
ERRPRINT("accVectors calloc failed\n");
goto _err;
}
if (!(outAccumul = initSpMMAcc(rowsPartsSizes[rowsPartsSizesN],aSubRowsN)))
goto _err;
#if SPARSIFY_PRE_PARTITIONING == T
//prepare sparse accumulators with U.Bounded rows[parts] starts
SPACC* accSp;
for( idx_t i=0,rLenCumul=0; i<aSubRowsN; rLenCumul += rowsPartsSizes[i++]){
accSp = outAccumul->accs+i;
accSp->JA = outAccumul->JA + rLenCumul;
accSp->AS = outAccumul->AS + rLenCumul;
}
#endif
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong tileID,t_i,t_j; //for aux vars
#pragma omp parallel for schedule(runtime) \
private(accV,accRowPart,colPart,rowBlock,colBlock,startRow,startCol,t_i,t_j)
for (tileID = 0; tileID < gridSize; tileID++){
///get iteration's indexing variables
//tile index in the 2D grid of AB computation TODO OMP HOW TO PARALLELIZE 2 FOR
t_i = tileID/cfg->gridCols; //i-th row block
t_j = tileID%cfg->gridCols; //j-th col block
//get tile row-cols group FAIR sizes
rowBlock = UNIF_REMINDER_DISTRI(t_i,_rowBlock,_rowBlockRem);
colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem);
startRow = UNIF_REMINDER_DISTRI_STARTIDX(t_i,_rowBlock,_rowBlockRem);
startCol = UNIF_REMINDER_DISTRI_STARTIDX(t_j,_colBlock,_colBlockRem);
colPart = colPartsB + t_j;
accV = accVectors + tileID;
DEBUGPRINT{
fflush(NULL);
printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",
t_i,startRow,startRow+rowBlock-1,rowBlock);
printf("colBlock [%lu\t%lu:%lu(%lu)]\n",
t_j,startCol,startCol+colBlock-1,colBlock);
fflush(NULL);
}
///AB[t_i][t_j] block compute
for (ulong r=startRow; r<startRow+rowBlock; r++){
//iterate over nz col index j inside current row r
//row-by-row restricted to colsubset of B to get AB[r][:colBlock:]
for (ulong j=A->IRP[r]-OFF_F,c,bRowStart,bRowLen; j<A->IRP[r+1]-OFF_F; j++){
//get start of B[A->JA[j]][:colBlock:]
c = A->JA[j]-OFF_F; // column of nnz entry in A[r][:] <-> target B row
bRowStart = colPart->IRP[c];
#ifdef ROWLENS
bRowLen = colPart->RL[c];
#else
bRowLen = colPart->IRP[c+1] - bRowStart;
#endif
CAT(scSparseVectMulPart_,OFF_F)(A->AS[j],
colPart->AS+bRowStart,colPart->JA+bRowStart,
bRowLen,startCol,accV);
}
accRowPart = outAccumul->accs + IDX2D(r,t_j,cfg->gridCols);
#if SPARSIFY_PRE_PARTITIONING == T
_sparsifyUB(accV,accRowPart,startCol);
#else
sparsifyUBNoPartsBounds(outAccumul,accV,accRowPart,startCol);
#endif
_resetAccVect(accV);
}
}
if (mergeRowsPartitions(outAccumul->accs,AB,cfg)) goto _err;
#if OFF_F != 0
C_FortranShiftIdxs(AB);
#endif
AUDIT_INTERNAL_TIMES End=omp_get_wtime();
DEBUG
CAT(checkOverallocRowPartsPercent_, OFF_F)(rowsPartsSizes,AB,cfg->gridCols,bColOffsets);
goto _free;
_err:
ERRPRINT("spmmRowByRow2DBlocksAllocated failed\n");
if (AB) freeSpmat(AB);
AB = NULL;
_free:
if (colPartsB){
for (ulong i=0; i<cfg->gridCols; i++)
freeSpmatInternal(colPartsB+i);
free(colPartsB);
}
free(rowsPartsSizes);
free(bColOffsets);
if (accVectors) freeAccsDense(accVectors,gridSize);
if (outAccumul) freeSpMMAcc(outAccumul);
return AB;
}
///SP3MM
spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,
CONFIG* cfg,SPMM_INTERF spmm){
double end,start,elapsed,partial,flops;
spmat *RAC = NULL, *out = NULL;
if (!spmm){
//TODO decide at runtime, based on the sizes, which spmm implementation to use if not given
spmm = &CAT(spmmRowByRow2DBlocks_,OFF_F);
}
/* TODO
alloc dense aux vector, reusable over the triple product
TODO arrays over-allocated so they can be reused across the 2 SpMMs
ulong auxVectSize = MAX(R->N,AC->N);
auxVectSize = MAX(auxVectSize,P->N);
*/
start = omp_get_wtime();
/// triple product as a pair of spmm
if (!(RAC = spmm(R,AC,cfg))) goto _free;
AUDIT_INTERNAL_TIMES partial = End - Start;
if (!(out = spmm(RAC,P,cfg))) goto _free;
//
end = omp_get_wtime();
ElapsedInternal = End - Start + partial;
VERBOSE {
elapsed = end - start;
flops = (2 * R->NZ * P->NZ * AC->NZ) / (elapsed);
printf("elapsed %le - flops %le",elapsed,flops);
AUDIT_INTERNAL_TIMES
printf("\tinternalTime: %le",ElapsedInternal);
printf("\n");
}
_free:
zeroSpmat(RAC);
freeSpmat(RAC);
return out;
}
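/* Hypothetical usage sketch (illustrative only, assuming CAT pastes the OFF_F suffix):
 *   spmat *out  = sp3mmRowByRowPair_0(R, AC, P, &cfg, &spmmRowByRow1DBlocks_0);
 *   //or, with 1-based (Fortran) matrices and the default kernel:
 *   spmat *outF = sp3mmRowByRowPair_1(R, AC, P, &cfg, NULL); //NULL -> spmmRowByRow2DBlocks_1
 * where cfg carries the parallelization grid, thread count and chunk-distribution callback. */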
////////Sp3MM direct
///1D
spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,
SPMM_INTERF spmm){
ulong* rowSizes = NULL;
SPMM_ACC* outAccumul=NULL;
ACC_DENSE *accVectorsR_AC=NULL,*accVectorsRAC_P=NULL,*accRAC,*accRACP;
///init AB matrix with SPMM heuristic preallocation
spmat* out = allocSpMatrix(R->M,P->N);
if (!out) goto _err;
/*TODO 3MM VERSION, COMPUTE-OUT ALLOC:
-> \forall RAC.row -> hashmap{col=True}->(AVL||RBTREE); upperBound via RAC.rows.cols kept in the hashmap || SYM_bis
* NB: UB for RACP => precise row sizes NOT known => same space-reservation approach as spmm ( fetch_and_add )
* SYM_BIS ==> row sizes known =>
* 1) pre-reserve space per row -> cache alignment per thread
-(sched. static & each row block aligned to the cache block following the previous rows' block)
-(sched. dynamic & all rows aligned to cache blocks (NO OVERLAPS!) -> huge overhead ?)
* 2) pre-reserve row space directly in the out CSR
-> likely cache block overlaps; but saves the cost of the P.M memcpy
*/
if (!(rowSizes = CAT(spMMSizeUpperbound_,OFF_F)(R,AC))) goto _err; ///TODO TOO LOOSE UB...INTEGRATE RBTREE FOR SYM->PRECISE
///aux structures alloc
if (!(outAccumul = initSpMMAcc(rowSizes[R->M],P->M))) goto _err; //TODO size estimated with RAC mat
if (!(accVectorsR_AC = _initAccVectors(cfg->threadNum,AC->N))){ //TODO LESS || REUSE
ERRPRINT("accVectorsR_AC init failed\n");
goto _err;
}
if (!(accVectorsRAC_P = _initAccVectors(cfg->threadNum,R->N))){ //TODO LESS || REUSE
ERRPRINT("accVectorsRAC_P init failed\n");
goto _err;
}
ulong c;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (R->M,R,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(accRAC,accRACP,c)
for (ulong r=0; r<R->M; r++){ //row-by-row formulation
//iterate over nz entry index c inside current row r
accRAC = accVectorsR_AC + omp_get_thread_num();
accRACP = accVectorsRAC_P + omp_get_thread_num();
//computing (tmp) R*AC r-th row
for (idx_t j=R->IRP[r]-OFF_F; j<R->IRP[r+1]-OFF_F; j++)
CAT(scSparseRowMul_,OFF_F)(R->AS[j], AC, R->JA[j]-OFF_F, accRAC);
//forward the computed row
for (idx_t j=0; j<accRAC->nnzIdxMap.len; j++){
c = accRAC->nnzIdx[j];
CAT(scSparseRowMul_,OFF_F)(accRAC->v[c],P,c,accRACP);
}
//transform the accumulated dense vector into a CSR row TODO in UB buff
sparsifyUBNoPartsBounds(outAccumul,accRACP,outAccumul->accs+r,0);
_resetAccVect(accRAC);
_resetAccVect(accRACP);
}
///merge sparse row computed before
if (mergeRows(outAccumul->accs,out)) goto _err;
#if OFF_F != 0
C_FortranShiftIdxs(out);
#endif
AUDIT_INTERNAL_TIMES{
End=omp_get_wtime();
ElapsedInternal = End-Start;
}
DEBUG checkOverallocPercent(rowSizes,out);
goto _free;
_err:
if(out) freeSpmat(out);
out = NULL;
_free:
if(rowSizes) free(rowSizes);
if(accVectorsR_AC) freeAccsDense(accVectorsR_AC,cfg->threadNum);
if(accVectorsRAC_P) freeAccsDense(accVectorsRAC_P,cfg->threadNum);
if(outAccumul) freeSpMMAcc(outAccumul);
return out;
}

@ -0,0 +1,86 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <config.h>
char* SCHEDULES[]={"OMP_SCHED_STATIC","OMP_SCHED_DYNAMIC","OMP_SCHED_GUIDED","OMP_SCHED_AUTO"};
void ompGetRuntimeSchedule(int* kind_chunk_monotonic){
/*
* export OMP_SCHEDULE="[modifier:]kind[, chunk]"
* typedef enum omp_sched_t {
* //schedule kinds
* omp_sched_static = 0x1,
* omp_sched_dynamic = 0x2,
* omp_sched_guided = 0x3,
* omp_sched_auto = 0x4,
* //schedule modifier //TODO API>=5.0
* omp_sched_monotonic = 0x80000000u //TODO OMP >=5
* } omp_sched_t;
*/
omp_sched_t k,kind; int chunk_size,monotonic=0;
omp_get_schedule(&kind,&chunk_size);
k=kind; //[monotonic OFF]
#if _OPENMP >= 201811 //OMP_SCHEDULE monotonic modifier from OpenMP 5.0
monotonic = omp_sched_monotonic & kind;
if(monotonic) k = kind - omp_sched_monotonic;
#endif
printf("omp sched gather:\tkind:%s\tomp chunkSize:%d\tmonotonic:%s\tfairChunkFolding:%d\n",
SCHEDULES[k-1],chunk_size,monotonic?"Y":"N",FAIR_CHUNKS_FOLDING);
if (kind_chunk_monotonic){
kind_chunk_monotonic[0] = k;
kind_chunk_monotonic[1] = chunk_size;
kind_chunk_monotonic[2] = monotonic;
}
}
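/* usage sketch: how the kind/chunk/monotonic triple gathered above can be consumed,
 * assuming e.g. `export OMP_SCHEDULE="dynamic,64"` before launch (guarded out of the build) */
#ifdef OMP_SCHED_GATHER_SKETCH
static void ompScheduleGatherSketch(void){
int kcm[3]; //schedule kind, chunk size, monotonic flag
ompGetRuntimeSchedule(kcm); //with the export above kcm should hold {2,64,0} (omp_sched_dynamic)
printf("kind=%d\tchunk=%d\tmonotonic=%d\n",kcm[0],kcm[1],kcm[2]);
}
#endif //OMP_SCHED_GATHER_SKETCH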
float ompVersionMacroMap(){
switch ( _OPENMP ){
case 200505: return 2.5;
case 200805: return 3.0;
case 201107: return 3.1;
case 201307: return 4.0;
case 201511: return 4.5;
case 201811: return 5.0;
case 202011: return 5.1;
default: return -1; //unknown _OPENMP version date macro
}
}
//WRAPPER to print all ICV vars
#ifdef OMP_GET_ICV_MAIN
int main(){
#else
void ompGetAllICV(){
#endif
printf("export OMP_DISPLAY_ENV=VERBOSE for full ICV details\n");
printf("omp MAX THREADS USABLE\t%d\n",omp_get_max_threads());
ompGetRuntimeSchedule(NULL);
printf("omp API version:\t %1.1f\n",ompVersionMacroMap());
}

File diff suppressed because it is too large

@ -0,0 +1,193 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2021 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009-2012 Oak Rigde National Laboratory. All rights reserved.
* Copyright (c) 2011-2020 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2013 INRIA. All rights reserved.
* Copyright (c) 2015 University of Houston. All rights reserved.
* Copyright (c) 2015-2021 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017-2019 IBM Corporation. All rights reserved.
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2021-2022 Google, LLC. All rights reserved.
* Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* Copyright (c) 2021 Bull S.A.S. All rights reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* Copyright (c) 2018-2021 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Error classes and codes
* Do not change the values of these without also modifying mpif.h.in.
*/
#define MPI_SUCCESS 0
#define MPI_ERR_BUFFER 1
#define MPI_ERR_COUNT 2
#define MPI_ERR_TYPE 3
#define MPI_ERR_TAG 4
#define MPI_ERR_COMM 5
#define MPI_ERR_RANK 6
#define MPI_ERR_REQUEST 7
#define MPI_ERR_ROOT 8
#define MPI_ERR_GROUP 9
#define MPI_ERR_OP 10
#define MPI_ERR_TOPOLOGY 11
#define MPI_ERR_DIMS 12
#define MPI_ERR_ARG 13
#define MPI_ERR_UNKNOWN 14
#define MPI_ERR_TRUNCATE 15
#define MPI_ERR_OTHER 16
#define MPI_ERR_INTERN 17
#define MPI_ERR_IN_STATUS 18
#define MPI_ERR_PENDING 19
#define MPI_ERR_ACCESS 20
#define MPI_ERR_AMODE 21
#define MPI_ERR_ASSERT 22
#define MPI_ERR_BAD_FILE 23
#define MPI_ERR_BASE 24
#define MPI_ERR_CONVERSION 25
#define MPI_ERR_DISP 26
#define MPI_ERR_DUP_DATAREP 27
#define MPI_ERR_FILE_EXISTS 28
#define MPI_ERR_FILE_IN_USE 29
#define MPI_ERR_FILE 30
#define MPI_ERR_INFO_KEY 31
#define MPI_ERR_INFO_NOKEY 32
#define MPI_ERR_INFO_VALUE 33
#define MPI_ERR_INFO 34
#define MPI_ERR_IO 35
#define MPI_ERR_KEYVAL 36
#define MPI_ERR_LOCKTYPE 37
#define MPI_ERR_NAME 38
#define MPI_ERR_NO_MEM 39
#define MPI_ERR_NOT_SAME 40
#define MPI_ERR_NO_SPACE 41
#define MPI_ERR_NO_SUCH_FILE 42
#define MPI_ERR_PORT 43
#define MPI_ERR_QUOTA 44
#define MPI_ERR_READ_ONLY 45
#define MPI_ERR_RMA_CONFLICT 46
#define MPI_ERR_RMA_SYNC 47
#define MPI_ERR_SERVICE 48
#define MPI_ERR_SIZE 49
#define MPI_ERR_SPAWN 50
#define MPI_ERR_UNSUPPORTED_DATAREP 51
#define MPI_ERR_UNSUPPORTED_OPERATION 52
#define MPI_ERR_WIN 53
#define MPI_T_ERR_MEMORY 54
#define MPI_T_ERR_NOT_INITIALIZED 55
#define MPI_T_ERR_CANNOT_INIT 56
#define MPI_T_ERR_INVALID_INDEX 57
#define MPI_T_ERR_INVALID_ITEM 58
#define MPI_T_ERR_INVALID_HANDLE 59
#define MPI_T_ERR_OUT_OF_HANDLES 60
#define MPI_T_ERR_OUT_OF_SESSIONS 61
#define MPI_T_ERR_INVALID_SESSION 62
#define MPI_T_ERR_CVAR_SET_NOT_NOW 63
#define MPI_T_ERR_CVAR_SET_NEVER 64
#define MPI_T_ERR_PVAR_NO_STARTSTOP 65
#define MPI_T_ERR_PVAR_NO_WRITE 66
#define MPI_T_ERR_PVAR_NO_ATOMIC 67
#define MPI_ERR_RMA_RANGE 68
#define MPI_ERR_RMA_ATTACH 69
#define MPI_ERR_RMA_FLAVOR 70
#define MPI_ERR_RMA_SHARED 71
#define MPI_T_ERR_INVALID 72
#define MPI_T_ERR_INVALID_NAME 73
#define MPI_ERR_PROC_ABORTED 74
/* not #if conditional on OPAL_ENABLE_FT_MPI for ABI */
#define MPI_ERR_PROC_FAILED 75
#define MPI_ERR_PROC_FAILED_PENDING 76
#define MPI_ERR_REVOKED 77
/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined
MPI_ERR_<foo> code. Set the last code to allow some room for adding
error codes without breaking ABI. */
#define MPI_ERR_LASTCODE 92
/*
* Comparison results. Don't change the order of these, the group
* comparison functions rely on it.
* Do not change the order of these without also modifying mpif.h.in.
*/
enum {
MPI_IDENT,
MPI_CONGRUENT,
MPI_SIMILAR,
MPI_UNEQUAL
};
/*
* MPI_Init_thread constants
* Do not change the order of these without also modifying mpif.h.in.
*/
enum {
MPI_THREAD_SINGLE,
MPI_THREAD_FUNNELED,
MPI_THREAD_SERIALIZED,
MPI_THREAD_MULTIPLE
};
/*
* Datatype combiners.
* Do not change the order of these without also modifying mpif.h.in.
* (see also mpif-common.h.fin).
*/
enum {
MPI_COMBINER_NAMED,
MPI_COMBINER_DUP,
MPI_COMBINER_CONTIGUOUS,
MPI_COMBINER_VECTOR,
#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
MPI_COMBINER_HVECTOR_INTEGER,
#else
OMPI_WAS_MPI_COMBINER_HVECTOR_INTEGER, /* preserve ABI compatibility */
#endif
MPI_COMBINER_HVECTOR,
MPI_COMBINER_INDEXED,
#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
MPI_COMBINER_HINDEXED_INTEGER,
#else
OMPI_WAS_MPI_COMBINER_HINDEXED_INTEGER, /* preserve ABI compatibility */
#endif
MPI_COMBINER_HINDEXED,
MPI_COMBINER_INDEXED_BLOCK,
#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
MPI_COMBINER_STRUCT_INTEGER,
#else
OMPI_WAS_MPI_COMBINER_STRUCT_INTEGER, /* preserve ABI compatibility */
#endif
MPI_COMBINER_STRUCT,
MPI_COMBINER_SUBARRAY,
MPI_COMBINER_DARRAY,
MPI_COMBINER_F90_REAL,
MPI_COMBINER_F90_COMPLEX,
MPI_COMBINER_F90_INTEGER,
MPI_COMBINER_RESIZED,
MPI_COMBINER_HINDEXED_BLOCK
};
////MOCK ERRHANDLERS
#define OMPI_ERRHANDLER_INVOKE(comm,err,fname) err
#define OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,fname) err

@ -0,0 +1,288 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2014 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* extraction from openMPI, mocking not needed dependencies for MPI_Dims_create function export:
* Copyright (c) 2022 Andrea Di Iorio
*/
//#include "ompi_config.h"
#include "ompi_config_minimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
//#include "ompi/mpi/c/bindings.h"
//#include "ompi/runtime/params.h"
//#include "ompi/communicator/communicator.h"
//#include "ompi/errhandler/errhandler.h"
static const char FUNC_NAME[] = "MPI_Dims_create";
/* static functions */
static int assignnodes(int ndim, int nfactor, int *pfacts,int **pdims);
static int getfactors(int num, int *nfactors, int **factors);
/*
* This is a utility function, no need to have anything in the lower
* layer for this at all
* ****
* original manpage ... https://www.open-mpi.org/doc/v3.1/man3/MPI_Dims_create.3.php
* For Cartesian topologies, the function MPI_Dims_create helps the user select a balanced distribution of processes per coordinate direction,
* depending on the number of processes in the group to be balanced and optional constraints that can be specified by the user.
* One use is to partition all the processes (the size of MPI_COMM_WORLD's group) into an n-dimensional topology.
* The entries in the array dims are set to describe a Cartesian grid with ndims dimensions and a total of nnodes nodes.
* The dimensions are set to be as close to each other as possible, using an appropriate divisibility algorithm.
*
* The caller may further constrain the operation of this routine by specifying elements of array dims.
* If dims[i] is set to a positive number, the routine will not modify the number of nodes in dimension i; only those entries where dims[i] = 0 are modified by the call.
* Negative input values of dims[i] are erroneous. An error will occur if nnodes is not a multiple of ((pi) over (i, dims[i] != 0)) dims[i].
* For dims[i] set by the call, dims[i] will be ordered in nonincreasing order. Array dims is suitable for use as input to routine MPI_Cart_create. MPI_Dims_create is local.
* Example:
*
* dims
* before dims
* call function call on return
* -----------------------------------------------------
* (0,0) MPI_Dims_create(6, 2, dims) (3,2)
* (0,0) MPI_Dims_create(7, 2, dims) (7,1)
* (0,3,0) MPI_Dims_create(6, 3, dims) (2,3,1)
* (0,3,0) MPI_Dims_create(7, 3, dims) erroneous call
* ------------------------------------------------------
*/
int MPI_Dims_create(int nnodes, int ndims, int dims[])
{
int i;
int freeprocs;
int freedims;
int nfactors;
int *factors;
int *procs;
int *p;
int err;
/*if (MPI_PARAM_CHECK) {
OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
if (0 > ndims) {
return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD,
MPI_ERR_DIMS, FUNC_NAME);
}
if ((0 != ndims) && (NULL == dims)) {
return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD,
MPI_ERR_ARG, FUNC_NAME);
}
if (1 > nnodes) {
return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD,
MPI_ERR_DIMS, FUNC_NAME);
}
}*/
assert( ndims > 0 && nnodes > 1 && NULL != dims );
/* Get # of free-to-be-assigned processes and # of free dimensions */
freeprocs = nnodes;
freedims = 0;
for (i = 0, p = dims; i < ndims; ++i,++p) {
if (*p == 0) {
++freedims;
} else if ((*p < 0) || ((nnodes % *p) != 0)) {
return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD, MPI_ERR_DIMS,
FUNC_NAME);
} else {
freeprocs /= *p;
}
}
if (freedims == 0) {
if (freeprocs == 1) {
return MPI_SUCCESS;
}
return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_DIMS,
FUNC_NAME);
}
if (freeprocs == 1) {
for (i = 0; i < ndims; ++i, ++dims) {
if (*dims == 0) {
*dims = 1;
}
}
return MPI_SUCCESS;
}
/* Factor the number of free processes */
if (MPI_SUCCESS != (err = getfactors(freeprocs, &nfactors, &factors))) {
return OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,
FUNC_NAME);
}
/* Assign free processes to free dimensions */
if (MPI_SUCCESS != (err = assignnodes(freedims, nfactors, factors, &procs))) {
free(factors);
return OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,
FUNC_NAME);
}
/* Return assignment results */
p = procs;
for (i = 0; i < ndims; ++i, ++dims) {
if (*dims == 0) {
*dims = *p++;
}
}
free((char *) factors);
free((char *) procs);
/* all done */
return MPI_SUCCESS;
}
/*
* assignnodes
*
* Function: - assign processes to dimensions
* - get "best-balanced" grid
* - greedy bin-packing algorithm used
* - sort dimensions in decreasing order
* - dimensions array dynamically allocated
* Accepts: - # of dimensions
* - # of prime factors
* - array of prime factors
* - ptr to array of dimensions (returned value)
* Returns: - 0 or ERROR
*/
static int
assignnodes(int ndim, int nfactor, int *pfacts, int **pdims)
{
int *bins;
int i, j;
int n;
int f;
int *p;
int *pmin;
if (0 >= ndim) {
return MPI_ERR_DIMS;
}
/* Allocate and initialize the bins */
bins = (int *) malloc((unsigned) ndim * sizeof(int));
if (NULL == bins) {
return MPI_ERR_NO_MEM;
}
*pdims = bins;
for (i = 0, p = bins; i < ndim; ++i, ++p) {
*p = 1;
}
/* Loop assigning factors from the highest to the lowest */
for (j = nfactor - 1; j >= 0; --j) {
f = pfacts[j];
/* Assign a factor to the smallest bin */
pmin = bins;
for (i = 1, p = pmin + 1; i < ndim; ++i, ++p) {
if (*p < *pmin) {
pmin = p;
}
}
*pmin *= f;
}
/* Sort dimensions in decreasing order (O(n^2) for now) */
for (i = 0, pmin = bins; i < ndim - 1; ++i, ++pmin) {
for (j = i + 1, p = pmin + 1; j < ndim; ++j, ++p) {
if (*p > *pmin) {
n = *p;
*p = *pmin;
*pmin = n;
}
}
}
return MPI_SUCCESS;
}
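/* illustrative trace (sketch): nnodes=12, ndim=2 -> getfactors gives {2,2,3};
 * bins start as (1,1); assigning the largest factor first: 3->(3,1), 2->(3,2), 2->(3,4);
 * sorted in decreasing order -> dims (4,3), i.e. MPI_Dims_create(12,2,{0,0}) returns (4,3) */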
/*
* getfactors
*
* Function: - factorize a number
* Accepts: - number
* - # prime factors
* - array of prime factors
* Returns: - MPI_SUCCESS or ERROR
*/
static int
getfactors(int num, int *nfactors, int **factors) {
int size;
int d;
int i;
int sqrtnum;
if(num < 2) {
(*nfactors) = 0;
(*factors) = NULL;
return MPI_SUCCESS;
}
/* Allocate the array of prime factors which cannot exceed log_2(num) entries */
sqrtnum = ceil(sqrt(num));
size = ceil(log(num) / log(2));
*factors = (int *) malloc((unsigned) size * sizeof(int));
i = 0;
/* determine all occurences of factor 2 */
while((num % 2) == 0) {
num /= 2;
(*factors)[i++] = 2;
}
/* determine all occurences of uneven prime numbers up to sqrt(num) */
d = 3;
for(d = 3; (num > 1) && (d <= sqrtnum); d += 2) {
while((num % d) == 0) {
num /= d;
(*factors)[i++] = d;
}
}
/* as we looped only up to sqrt(num) one factor > sqrt(num) may be left over */
if(num != 1) {
(*factors)[i++] = num;
}
(*nfactors) = i;
return MPI_SUCCESS;
}
#ifdef TEST_MAIN
int main(int argc,char** argv){
if (argc != 2) {fprintf(stderr,"usage: <N> -> to fit in a 2D grid");return EXIT_FAILURE;}
int dims2[2] = {0,0};
int n = atoi(argv[1]);
if (MPI_Dims_create(n,2,dims2)){return EXIT_FAILURE;} printf("%d 2D division:\t%d-%d\n",n,dims2[0],dims2[1]);
}
#endif //TEST_MAIN

@ -0,0 +1,499 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <errno.h>
#include "macros.h"
#include "sparseMatrix.h"
#include "utils.h"
/////////////// no offset --> SINGLE implementation functions
#ifndef SPARSE_UTILS_C
#define SPARSE_UTILS_C
///DENSE accumulator utils
/*
* alloc threads' aux arrays once and split them in threads' structures
* so free them once from the first thread struct, with the original pointers returned from the alloc
*/
ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEXT
ACC_DENSE* out = NULL;
double* vAll = NULL;
ulong* vAllNzIdx = NULL;
if (!(out = calloc(num,sizeof(*out)))){
ERRPRINT("_initAccVectors aux struct alloc failed\n");
return NULL;
}
if (!(vAll = calloc(num*size,sizeof(*vAll)))) {
ERRPRINT("_initAccVectors aux dense vectors alloc failed\n");
goto err;
}
if (!(vAllNzIdx = calloc(num*size,sizeof(*vAllNzIdx)))) {
ERRPRINT("_initAccVectors aux dense vectors' idx alloc failed\n");
goto err;
}
for (ulong i=0; i<num; i++){
out[i].vLen = size; //TODO USELESS INFO?
out[i].v = vAll + i*size;
out[i].nnzIdx = vAllNzIdx + i*size;
out[i].nnzIdxMap.len = 0;
}
return out;
err:
free(out);
free(vAll);
free(vAllNzIdx);
return NULL;
}
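/* note (sketch): accumulators from _initAccVectors_monoalloc share the two backing arrays,
 * so they must be released once through the first struct, e.g.
 *	free(accs[0].v); free(accs[0].nnzIdx); free(accs);
 * freeAccsDense() below instead matches the per-accumulator allocation of _initAccVectors() */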
int allocAccDense(ACC_DENSE* v,ulong size){
v->vLen = size;
if (!(v->v = calloc(size,sizeof(*(v->v))))) {
ERRPRINT("_initAccVectors aux dense vector alloc failed\n");
return EXIT_FAILURE;
}
if (!(v->nnzIdx = calloc(size,sizeof(*(v->nnzIdx))))) {
ERRPRINT("_initAccVectors aux dense vector' idx alloc failed\n");
return EXIT_FAILURE;
}
if (initSpVectIdxDenseAcc(size, &v->nnzIdxMap)) return EXIT_FAILURE;
return EXIT_SUCCESS;
}
ACC_DENSE* _initAccVectors(ulong num,ulong size){
ACC_DENSE* out = NULL;
if (!(out = calloc(num,sizeof(*out)))){
ERRPRINT("_initAccVectors aux struct alloc failed\n");
return NULL;
}
for (ulong i=0; i<num; i++){
if (allocAccDense(out+i,size)) goto _err;
}
return out;
_err:
for (ulong i=0; i<num; i++){
if (out[i].v) free(out[i].v);
if (out[i].nnzIdx) free(out[i].nnzIdx);
}
free(out);
return NULL;
}
void freeAccsDense(ACC_DENSE* vectors,ulong num){
for (ulong i=0; i<num; i++){
free(vectors[i].v);
free(vectors[i].nnzIdx);
free(vectors[i].nnzIdxMap.idxsMap);
}
free(vectors);
}
void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num){
if (!vectors) return;
for (ulong i=0; i<num; i++){
if(vectors[i].v) free(vectors[i].v);
if(vectors[i].nnzIdx) free(vectors[i].nnzIdx);
}
free(vectors);
}
int initSpVectIdxDenseAcc(idx_t idxMax,SPVECT_IDX_DENSE_MAP* vectIdxsMap){
vectIdxsMap->len = 0;
nnz_idxs_flags_t* idxMaps = &vectIdxsMap->idxsMap;
#if SPVECT_IDX_BITWISE == TRUE //nnz presence flags as bitflags in limbs
vectIdxsMap->idxsMapN = INT_DIV_CEIL(idxMax, LIMB_SIZE_BIT);
#else //nnz presence flags in a single array
vectIdxsMap->idxsMapN = idxMax;
#endif //SPVECT_IDX_BITWISE == TRUE
if (!(*idxMaps = calloc(vectIdxsMap->idxsMapN, sizeof(**idxMaps)))) {
ERRPRINT("initSpVectIdxDenseAcc\tidxMaps SPVECT_IDX_BITWISE callc err\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
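/* sizing example (assuming 64-bit limbs, i.e. LIMB_SIZE_BIT == 64): with idxMax = 1000
 * the bitwise variant allocates INT_DIV_CEIL(1000,64) = 16 limbs of presence bitflags,
 * while the plain variant allocates 1000 per-index flags */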
void checkOverallocPercent(ulong* forecastedSizes,spmat* AB){
for (ulong r=0,rSize,forecastedSize; r < AB->M; r++){
forecastedSize = forecastedSizes[r];
#ifdef ROWLENS
rSize = AB->RL[r];
#else
rSize = AB->IRP[r+1] - AB->IRP[r];
#endif
DEBUGCHECKS{
if ( forecastedSize < rSize ){
ERRPRINT("BAD FORECASTING\n");
assert(forecastedSize >= rSize );
}
}
DEBUGPRINT
printf("extra forecastedSize of row: %lu\t=\t%lf %% \n",
r,100*(forecastedSize-rSize) / (double) forecastedSize);
}
idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
}
int spmatDiff(spmat* A, spmat* B){
if (A->NZ != B->NZ){
ERRPRINT("NZ differ\n");
return EXIT_FAILURE;
}
if (memcmp(A->IRP,B->IRP,(A->M+1)*sizeof(*(A->IRP)))){ //IRP has M+1 entries; compare bytes, not elements
ERRPRINT("IRP differ\n");
return EXIT_FAILURE;
}
if (doubleVectorsDiff(A->AS,B->AS,A->NZ,NULL)){
ERRPRINT("AS DIFFER\n");
return EXIT_FAILURE;
}
if (memcmp(A->JA,B->JA,A->NZ*sizeof(*(A->JA)))){ //compare bytes, not elements
ERRPRINT("JA differ\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
double* CSRToDense(spmat* sparseMat){
double* denseMat;
ulong i,j,idxNZ,denseSize;
if (__builtin_umull_overflow(sparseMat->M,sparseMat->N,&denseSize)){
ERRPRINT("overflow in dense allocation\n");
return NULL;
}
if (!(denseMat = calloc(denseSize, sizeof(*denseMat)))){
ERRPRINT("dense matrix alloc failed\n");
return NULL;
}
for (i=0;i<sparseMat->M;i++){
for (idxNZ=sparseMat->IRP[i]; idxNZ<sparseMat->IRP[i+1]; ++idxNZ){
j = sparseMat->JA[idxNZ];
//converting sparse item into dense entry
denseMat[(ulong) IDX2D(i,j,sparseMat->N)] = sparseMat->AS[idxNZ];
}
}
return denseMat;
}
void printSparseMatrix(spmat* spMatrix,char justNZMarkers){
double* denseMat = CSRToDense(spMatrix);
if (!denseMat) return;
printMatrix(denseMat,spMatrix->M,spMatrix->N,justNZMarkers);
free(denseMat);
}
static inline int _colsPartitioningUnifRanges_init(spmat* A,uint gridCols,
spmat** colParts,idx_t** colPartsLens){
spmat* colPart;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
///alloc/init partitions structures
if (!(*colParts = calloc(gridCols, sizeof(**colParts)))){
ERRPRINT("colsPartitioningUnifRanges\tcolumns partitions of A calloc fail\n");
return EXIT_FAILURE;
}
for (ulong i=0,colBlock; i<gridCols; i++){
colBlock = UNIF_REMINDER_DISTRI(i,_colBlock,_colBlockRem);
colPart = *colParts + i;
if (allocSpMatrixInternal(A->M,colBlock,colPart)){
ERRPRINT("colsPartitioningUnifRanges\tallocSpMatrixInternal partition err\n");
return EXIT_FAILURE;
}
//TODO TODO overalloc A cols partitions NZ arrays, then realloc
if (!(colPart->AS = malloc(A->NZ * sizeof(*A->AS)))){
ERRPRINT("colPart of A overalloc of AS errd\n");
return EXIT_FAILURE;
}
if (!(colPart->JA = malloc(A->NZ * sizeof(*A->JA)))){
ERRPRINT("colPart of A overalloc of JA errd\n");
return EXIT_FAILURE;
}
}
//for each A col partition -> last copied nz index = nnz copied amount
if (! (*colPartsLens = calloc(gridCols, sizeof(**colPartsLens))) ) {
ERRPRINT("colsPartitioningUnifRanges: colPartsLens calloc errd\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,uint gridCols,
spmat* colParts,idx_t* colPartsLens){
spmat* colPart;
double* tmpAS; idx_t* tmpJA;
//realloc overallocated A parts NZ arrays (TODO -> downsizing -> nofails?)
for (ulong i=0,partLen; i<gridCols; i++){
colPart = colParts + i;
partLen = colPartsLens[i];
if (!(tmpAS = realloc(colPart->AS,partLen*sizeof(*(colPart->AS))))){
ERRPRINT("realloc overallocated cols partition AS array\n");
return EXIT_FAILURE;
}
colPart->AS = tmpAS;
if (!(tmpJA = realloc(colPart->JA,partLen*sizeof(*(colPart->JA))))){
ERRPRINT("realloc overallocated cols partition JA array\n");
return EXIT_FAILURE;
}
colPart->JA = tmpJA;
colPart->NZ = partLen;
colPart->IRP[A->M] = partLen;
}
return EXIT_SUCCESS;
}
#endif
#ifndef OFF_F
#error generic implementation requires OFF_F defined
#endif
//////////////////////// CSR SPECIFIC -- TODO RENAME //////////////////
///SPARSE MATRIX PARTITIONING
idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
ulong subRowsN = A->M * gridCols;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
ulong* offsets = malloc( (subRowsN+1) * sizeof(*offsets) );
if (!offsets) {
ERRPRINT("colsOffsetsPartitioningUnifRanges:\toffsets malloc errd\n");
return NULL;
}
///OFFSETS COMPUTE FOR COL GROUPS -> O( A.NZ )
for (ulong r=0, j=0; r<A->M; j=A->IRP[++r]-OFF_F){
offsets[ IDX2D(r,0,gridCols) ] = j; //row's first gc start is constrained
//navigate column groups inside current row
for (ulong gc=1,gcStartCol; gc<gridCols; gc++){
gcStartCol = UNIF_REMINDER_DISTRI_STARTIDX(gc,_colBlock,_colBlockRem);
//goto GroupCols start entry,keeping A's nnz entries navigation (idx j)
//for (ulong c=A->JA[j]-OFF_F; c<gcStartCol && j < A->IRP[r+1]-OFF_F; c=A->JA[++j]-OFF_F);
while (j < A->IRP[r+1]-OFF_F && A->JA[j]-OFF_F < gcStartCol)
j++;
offsets[ IDX2D(r,gc,gridCols) ] = j; //row's gc group startIdx
}
}
offsets[subRowsN] = A->NZ; //last row's partition end constrained
return offsets;
}
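/* worked example (sketch, OFF_F=0 / C indexing): A 2x4 with IRP={0,2,4}, JA={0,3,1,2}, gridCols=2
 * -> column groups {0,1} and {2,3}; the returned offsets are {0,1,2,3,4}:
 * row 0 owns nnz range [0,1) in group 0 and [1,2) in group 1, row 1 owns [2,3) and [3,4) */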
spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,
idx_t** colPartsOffsets)
{
spmat *colParts = NULL, *colPart;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
ulong *colPartsLens=NULL;
///alloc/init partitions structures
idx_t* colOffsets = NULL;
if (!(colOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(A,gridCols)))
goto _err;
if (_colsPartitioningUnifRanges_init(A,gridCols,&colParts,&colPartsLens))
goto _err;
//OFFSET BASED COPY OF A.COL_GROUPS -> O( A.NZ )
for (idx_t r=0,gcId=0; r<A->M; r++){
for (idx_t gc=0,gcStartIdx=0,gLen=0; gc<gridCols; gc++,gcId++){
//gcId = IDX2D(r,gc,gridCols); //seqent. scan of parts
colPart = colParts + gc;
gcStartIdx = colOffsets[ gcId ];
gLen = colOffsets[ gcId+1 ] - gcStartIdx;
colPart->IRP[r] = colPartsLens[gc]; //new line for the col partition
//actual copy of nnz entries to colPartitions
memcpy(colPart->AS+colPart->IRP[r], A->AS+gcStartIdx, gLen*sizeof(*A->AS));
memcpy(colPart->JA+colPart->IRP[r], A->JA+gcStartIdx, gLen*sizeof(*A->JA));
colPartsLens[gc] += gLen;
#ifdef ROWLENS
colPart->RL[r] = gLen;
#endif
}
}
//realloc overallocated A parts NZ arrays
if(_colsPartitioningUnifRanges_finalRealloc(A,gridCols,colParts,colPartsLens))
goto _err;
free(colPartsLens);
if (colPartsOffsets) *colPartsOffsets = colOffsets; //save for the caller
else free(colOffsets);
return colParts;
_err:
//*colPartsOffsets is only set on the success path; free local resources only
if (colParts){
for (ulong i=0; i<gridCols; i++) freeSpmatInternal(colParts+i);
free(colParts);
}
free(colOffsets);
free(colPartsLens);
return NULL;
}
spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
spmat *colParts, *colPart;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *colPartsLens=NULL;
///alloc/init partitions structures
if (_colsPartitioningUnifRanges_init(A,gridCols,&colParts,&colPartsLens))
goto _err;
/* TODO
* Parallelize: collapse the 2 for loops with OMP, turn gcEndCol into a start index, ...
* or wrap this in a static inline
*/
for (ulong r=0, j=0; r<A->M; j=A->IRP[++r]-OFF_F){
//navigate column groups inside current row
for (ulong gc=0,gcEndCol=0,i; gc<gridCols ; gc++,j+=i){
i = 0; //@i=len current subpartition of row @r to copy
colPart = colParts + gc;
//NB: IRP is not shifted because it is handled as an internal implementation component
//but the JA idxs are memcopied as they are in A and shifted inside the functions that use them
colPart->IRP[r] = colPartsLens[gc];
gcEndCol += UNIF_REMINDER_DISTRI(gc,_colBlock,_colBlockRem);
//goto next GroupCols,keeping A's nnz entries navigation ( index j+i )
//for (ulong c=A->JA[j+i]-OFF_F; c<gcEndCol && j+i < A->IRP[r+1]-OFF_F; c=A->JA[j+ ++i]-OFF_F);
while ( j+i < A->IRP[r+1]-OFF_F && A->JA[j+i]-OFF_F < gcEndCol ) i++;
memcpy(colPart->AS+colPart->IRP[r], A->AS+j, i*sizeof(*A->AS));
memcpy(colPart->JA+colPart->IRP[r], A->JA+j, i*sizeof(*A->JA));
colPartsLens[gc] += i;
#ifdef ROWLENS
colPart->RL[r] = i;
#endif
}
}
//realloc overallocated A parts NZ arrays
if(_colsPartitioningUnifRanges_finalRealloc(A,gridCols,colParts,colPartsLens))
goto _err;
free(colPartsLens);
return colParts;
_err:
for (ulong i=0; i<gridCols; i++) freeSpmatInternal(colParts+i);
if(colParts) free(colParts);
if(colPartsLens) free(colPartsLens);
return NULL;
}
void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
idx_t gridCols,idx_t* bColOffsets){
idx_t* abColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(AB, gridCols);
assert(abColOffsets); //partitioning error
for (idx_t rLen,forecast,partId=0; partId<AB->M*gridCols; partId++){
forecast = forecastedSizes[partId];
rLen = abColOffsets[partId+1] - abColOffsets[partId];
DEBUGCHECKS assert(forecast >= rLen);
}
idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
free(abColOffsets);
}
#ifdef SPARSEUTILS_MAIN_TEST ///unit test embbeded
///inline export here
//SPMV_CHUNKS_DISTR spmvChunksFair;
spmat* allocSpMatrix(ulong rows, ulong cols);
int allocSpMatrixInternal(ulong rows, ulong cols, spmat* mat);
void freeSpmatInternal(spmat* mat);
void freeSpmat(spmat* mat);
////INTERNAL TEST FUNCTIONS
//test that each row's partition from colsOffsetsPartitioningUnifRanges is in the correct index range
#include <alloca.h>
int testColsOffsetsPartitioningUnifRanges(spmat* mat,ulong gridCols,ulong* partsOffs){
ulong _colBlock = mat->N/gridCols, _colBlockRem = mat->N%gridCols;
ulong j=0; //CSR scanning nnz idx
ulong* colPartsPopulations = alloca(gridCols * sizeof(*colPartsPopulations));
memset(colPartsPopulations,0,gridCols * sizeof(*colPartsPopulations));
for (ulong r=0,pId=0; r<mat->M; r++){
for (ulong gc=0,pStartIdx,pEndIdx; gc<gridCols; gc++,pId++){
pStartIdx = UNIF_REMINDER_DISTRI_STARTIDX(gc,_colBlock,_colBlockRem);
pEndIdx = UNIF_REMINDER_DISTRI_STARTIDX(gc+1,_colBlock,_colBlockRem)-1;
//pId=IDX2D(r,gc,gridCols);
for (ulong idx=partsOffs[pId],c; idx<partsOffs[pId+1]; idx++,j++){
c = mat->JA[idx];
assert(j == idx); //consecutive index in partitioning
assert(pStartIdx <= c && c <= pEndIdx); //colRange
assert(mat->IRP[r] <= idx && idx <= mat->IRP[r+1] ); //rowRange
}
colPartsPopulations[gc] += partsOffs[pId+1] - partsOffs[pId];
}
}
assert(j == mat->NZ);
ulong s=0;
for (ulong gc=0,partSize; gc < gridCols; gc++,s+=partSize){
partSize = colPartsPopulations[gc];
double partShare=partSize/(double)mat->NZ,partsAvg=1/(double)gridCols;
double partShareAvgDiff = partShare - partsAvg;
printf("colPartition %lu has:\t%lu = %lf of NNZ\t\t .. %lf\tAVG diff\n",
gc,partSize,partShare,partShareAvgDiff);
}
assert(s == mat->NZ); //TODO DUPLICATED
return EXIT_SUCCESS;
}
CONFIG Conf = {
.gridRows = 8,
.gridCols = 8,
};
#include "parser.h"
int main(int argc, char** argv){
int out=EXIT_FAILURE;
if (init_urndfd()) return out;
if (argc < 2 ) {ERRPRINT("usage: <MatrixMarket/COO matrix file for the test>\n"); return out;}
////parse sparse matrix and dense vector
spmat* mat;
char* trgtMatrix = TMP_EXTRACTED_MARTIX;
if (extractInTmpFS(argv[1],TMP_EXTRACTED_MARTIX) < 0)
trgtMatrix = argv[1];
if (!(mat = MMtoCSR(trgtMatrix))){
ERRPRINT("err during conversion MM -> CSR\n");
return out;
}
////partitioning test
ulong* colsPartitions = colsOffsetsPartitioningUnifRanges_0(mat,Conf.gridCols);
if (!colsPartitions) goto _free;
if (testColsOffsetsPartitioningUnifRanges(mat,Conf.gridCols,colsPartitions))
goto _free;
out=EXIT_SUCCESS;
printf("testColsOffsetsPartitioningUnifRanges passed with "
"mat: %lux%lu-%luNNZ\tgrid: %dx%d\n",
mat->M,mat->N,mat->NZ,Conf.gridRows,Conf.gridCols);
_free:
if (colsPartitions) free(colsPartitions);
return out;
}
#endif //SPARSEUTILS_MAIN_TEST

@ -0,0 +1,38 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "sparseUtilsMulti.h"
#define OFF_F 0
#include "sparseUtilsGeneric.c"
#undef OFF_F
#define OFF_F 1
#include "sparseUtilsGeneric.c"
#undef OFF_F

@ -0,0 +1,502 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
//various aux functions
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
//#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <errno.h>
#include <limits.h>
#include "macros.h"
#include "sparseMatrix.h"
#include "utils.h"
int urndFd; //will point to urandom device file
///IO
//UNBUFFERED IO
int write_wrap(int fd,void* src,size_t count){
ssize_t wr;
size_t written=0;
while (written < count){
wr=write(fd,src+written,count-written);
if (wr<0){
perror("write");
return wr;
}
written+=wr;
}
return 0;
}
int read_wrap(int fd,void* dst,size_t count){
ssize_t rd;
size_t readed=0;
while (readed < count){
rd=read(fd,dst+readed,count-readed);
if (rd<0){
perror("read");
return rd;
}
readed+=rd;
}
return 0;
}
int readALL_wrap(int fd,char** dst,size_t* count){
ssize_t rd=!0; //to allow count > fsize
size_t readed=0;
char allocated=0; //flag if required *dst allocation
if (!(*dst)){ //allocate dst buffer of same size of file
off_t seekCurr=lseek(fd,0,SEEK_CUR);
off_t fsize=lseek(fd,0,SEEK_END);
if( seekCurr==-1 || fsize==-1 || lseek(fd,seekCurr,SEEK_SET)==-1){
perror("lseek");
return EXIT_FAILURE;
}
*count=fsize;
if (!(*dst=malloc(fsize))){
fprintf(stderr,"malloc read_wrap file size buf error\n");
return EXIT_FAILURE;
}
allocated=!0;
}
//read loop
while (readed < *count && rd > 0){
rd=read(fd,(*dst)+readed,*count-readed);
if (rd<0){
perror("read");
if (allocated) free(*dst);
return rd;
}
readed+=rd;
}
if (readed < *count) (*dst)[readed]='\0'; //TODO NEEDED?
return EXIT_SUCCESS;
}
int init_urndfd(){ // wrap init urndFd
if((urndFd=open(DRNG_DEVFILE,O_RDONLY))<0){
perror("open DRNG_DEVFILE");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
int createNewFile(char* const outFpath){
int mode=S_IRWXU;
errno = 0;
int outFd=open(outFpath, O_WRONLY | O_CREAT | O_TRUNC, mode);
if (errno==EEXIST) outFd=open(outFpath, O_WRONLY | O_TRUNC, mode);
if (outFd<0) perror("open outFd failed ");
return outFd;
}
///BUFFERED IO
//TODO series of 0 returns.... fix
int fread_wrap(FILE* fp,void* dst,size_t count){
int rd=0,toRead;
size_t readed=0;
while (readed < count){
toRead = count - readed;
rd=fread(dst+readed,1,toRead,fp);
if (rd != toRead){ //TODO SHORT ITEM COUNT
if (feof(fp)) return EOF;
else if (ferror(fp)){
ERRPRINT("fread_wrap errd\n");
return -2;
}
//else ERRPRINT("W** SHORT ITEM COUNT RETURN MEANS!?"); //TODO OO
}
readed+=rd;
}
return rd;
}
///STRUCTURED DATA IO
int writeDoubleVector(char* fpath,double* v,ulong size){
int fd,out=EXIT_FAILURE;
if ( (fd = createNewFile(fpath) ) < 0 ) goto _end;
write_wrap(fd,v,size * sizeof(*v)); //raw write of vector
out = EXIT_SUCCESS;
DEBUG printf("written double vector into %s RAW of %lu elements\n",fpath,size);
_end:
if (close(fd) == EOF) perror("close errd\n");
return out;
}
///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
int writeDoubleVectorAsStr(char* fpath,double* v,ulong size){
int out=EXIT_FAILURE;
FILE* fp = fopen(fpath,"w");
if (!fp){
perror("fopen vector file write");
return EXIT_FAILURE;
}
for (ulong i=0; i<size; i++){
if (fprintf(fp,DOUBLE_STR_FORMAT,v[i]) < 0){
ERRPRINT("fprintf to out vector file errd\n");
goto _end;
}
}
out = EXIT_SUCCESS;
DEBUG printf("written vector into %s of %lu elements\n",fpath,size);
_end:
if (fclose(fp) == EOF) perror("fclose errd\n");
return out;
}
double* readDoubleVector(char* fpath,ulong* size){
int rd,hleft;
double *out,*tmp;
ulong i=0,vectorSize=RNDVECTORSIZE;
if (*size) vectorSize = *size;
FILE* fp = fopen(fpath,"r");
if (!fp){
perror("fopen vector file");
return NULL;
}
if (!(out = malloc(vectorSize * sizeof(*out)))){
ERRPRINT("vector read malloc fail for file\n");
goto _err;
}
while (1){
if (i >= vectorSize ){ //realloc the array
vectorSize *= VECTOR_STEP_REALLOC;
if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){
ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20);
goto _err;
}
out = tmp;
DEBUG printf("reallocd to ~~ %lu MB\n",vectorSize >> 20);
}
ulong toRead = MIN(VECTOR_READ_BLOCK,(vectorSize-i));
if((rd = fread_wrap(fp,out + i,sizeof(*out)*toRead)) == -2){
ERRPRINT("fread_wrap errd\n");
goto _err;
}
if(feof(fp)) //TODO rd == EOF not work always != W*F???
break;
i += rd/sizeof(*out);
if( (hleft = rd % sizeof(*out)) ){ //partial double read
DEBUG hprintf("half left double read... rounding down fptr\n");
if(fseek(fp,-hleft,SEEK_CUR)){
perror("fseek in readDoubleVector");
goto _err;
}
}
}
//REALLOC THE ARRAY TO THE FINAL SIZE
assert( i > 0 );
*size = i; //actual number of doubles read
if (!(tmp = realloc(out,*size*sizeof(*out)))){
ERRPRINT("realloc errd\n");
goto _err;
}
out = tmp;
DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size);
goto _free;
_err:
free(out);
out = NULL;
_free:
if (fclose(fp) == EOF) perror("fclose errd\n");
return out;
}
double* readDoubleVectorStr(char* fpath,ulong* size){
int fscanfOut;
double *out,*tmp;
ulong i=0,vectorSize=RNDVECTORSIZE;
if (*size) vectorSize = *size;
FILE* fp = fopen(fpath,"r");
if (!fp){
perror("fopen vector file");
return NULL;
}
if (!(out = malloc(vectorSize * sizeof(*out)))){
ERRPRINT("vector read malloc fail for file\n");
goto _err;
}
while (1){
if (i >= vectorSize ){ //realloc the array
vectorSize *= VECTOR_STEP_REALLOC;
if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){
ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20);
goto _err;
}
out = tmp;
DEBUG printf("reallocd to ~~ %lu MB\n",vectorSize >> 20);
}
fscanfOut = fscanf(fp,DOUBLE_STR_FORMAT,out + i++ );
if ( fscanfOut == EOF && ferror(fp)){
perror("invalid fscanf");
goto _err;
}
if ( fscanfOut != 1 || fscanfOut == EOF ) break; //end of vector
}
//REALLOC THE ARRAY TO THE FINAL SIZE
assert( i > 0 );
*size = --i;
if (!(tmp = realloc(out,*size*sizeof(*out)))){
ERRPRINT("realloc errd\n");
goto _err;
}
out = tmp;
DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size);
goto _free;
_err:
free(out);
out = NULL;
_free:
if (fclose(fp) == EOF) perror("fclose errd\n");
return out;
}
///CFG-AUX
int getConfig(CONFIG* conf){
int changes=EXIT_FAILURE;
char *varVal,*ptr;
ulong val;
if ((varVal = getenv(GRID_ROWS))){
val=strtoul(varVal,&ptr,10);
if (ptr==varVal || val>= UINT_MAX){
perror("strtol errd");
} else {
conf->gridRows = val;
}
changes = EXIT_SUCCESS;
}
if ((varVal = getenv(GRID_COLS))){
val=strtoul(varVal,&ptr,10);
if (ptr==varVal || val>= UINT_MAX){
perror("strtol errd");
} else {
conf->gridCols = val;
}
changes = EXIT_SUCCESS;
}
return changes;
}
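/* usage sketch: GRID_ROWS / GRID_COLS are macros (from macros.h) holding the env var names;
 * assuming they expand to "GRID_ROWS" and "GRID_COLS", e.g.
 *	export GRID_ROWS=8 GRID_COLS=4
 * makes getConfig(&conf) overwrite conf.gridRows=8 and conf.gridCols=4, returning EXIT_SUCCESS */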
/////LIB-SORTING -- WRAPPERS
//comparing functions
static inline int cmp_idx_t(const void* a, const void*b){
idx_t aa=*((ulong*) a), bb = *((ulong*) b);
return aa==bb?0:aa>bb?1:-1;
}
static inline int cmpulong(const void* a, const void*b){
ulong aa=*((ulong*) a), bb = *((ulong*) b);
return aa==bb?0:aa>bb?1:-1;
}
static inline int cmpuint(const void* a, const void*b){
uint aa=*((uint*) a), bb = *((uint*) b);
return aa==bb?0:aa>bb?1:-1;
}
static inline int cmpRbNode(const void* a, const void* b){
rbNode *aa=(rbNode*) a, *bb = (rbNode*) b;
return cmp_idx_t(&aa->key,&bb->key);
}
//sorting functions
void sort_idx_t(idx_t* arr, idx_t len){
qsort(arr,len,sizeof(*arr),cmp_idx_t);
}
void sortulong(ulong* arr, ulong len){
qsort(arr,len,sizeof(*arr),cmpulong);
}
void sortuint(uint* arr, uint len){
qsort(arr,len,sizeof(*arr),cmpuint);
}
void sortRbNode(rbNode* arr, idx_t len){
qsort(arr,len,sizeof(*arr),cmpRbNode);
}
////Tests AUX
inline void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen){
if (arrLen < 2) return; //nothing to compare
for (idx_t i=1,last=arrSorted[0]; i<arrLen; last = arrSorted[i++])
assert( arrSorted[i] != last );
}
///MATH UTILS
static inline int rndDouble_sinAll(double* d){
if(read_wrap(urndFd,(void*) d,sizeof(*d))){
ERRPRINT("read_wrap failed to read rnd double\n");
return EXIT_FAILURE;
}
*d = sin(*d) * MAXRND;
return EXIT_SUCCESS;
}
long _rndHold; //permanent storage of rnd longs
static inline int rndDouble_sinDecimal(double* d){
if(read_wrap(urndFd,(void*) &_rndHold,sizeof(_rndHold))){
ERRPRINT("read_wrap failed to read holder for rnd double\n");
return EXIT_FAILURE;
}
*d = (_rndHold % (ulong) ceil(MAXRND)) + sin(_rndHold);
return EXIT_SUCCESS;
}
void statsAvgVar(double* values,uint numVals, double* out){
double sum=0,sumSquare=0;
for (uint i=0; i<numVals; i++){
sum += values[i];
sumSquare += values[i]*values[i];
}
out[0] = sum/numVals; //AVG
out[1] = sumSquare/numVals - (out[0] * out[0]); //VAR
}
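//e.g. values={1,2,3}: out[0]=2 (mean), out[1]=14/3-4≈0.667 (population variance E[x^2]-E[x]^2)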
/// MATRIX - VECTOR COMPUTE UTILS
int fillRndVector(ulong size, double* v){
for( ulong x=0; x<size; ++x ){
if(rndDouble_sinAll( v+x )) return EXIT_FAILURE;
#ifdef RNDVECTMIN
v[x] += RNDVECTMIN;
#endif
}
return EXIT_SUCCESS;
}
//convention true result in @a, toCheck in @b
int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
int out = EXIT_SUCCESS;
double diff,diffAbs,_diffMax=0;
if (diffMax) *diffMax = 0;
else diffMax = &_diffMax;
for (ulong i=0; i<n; i++){
diff = a[i] - b[i];
diffAbs = ABS( diff );
if( diffAbs > DOUBLE_DIFF_THREASH ){
out = EXIT_FAILURE;
ERRPRINTS("DOUBLE VECTORS DIFF: DOUBLE_DIFF_THREASH=%lf\t<\t"
"|%13lg| = %lf %% of @a[%lu]\n",
DOUBLE_DIFF_THREASH,diff,100*diffAbs/ABS(a[i]),i);
#ifdef DOUBLE_VECT_DIFF_EARLY_EXIT
/*#pragma message("DOUBLE_VECT_DIFF_EARLY_EXIT: only first diff double reported")*/
return EXIT_FAILURE;
#endif
}
if ( ABS(*diffMax) < diffAbs ) *diffMax = diff;
}
DEBUG{
printf("\nchecked diff %s"CEND" between 2 double vector of %lu elements"
"\tmax diff: %le %s threash: %le\n", !out?CCC"OK":CCCERR"ERR!",
n,*diffMax,*diffMax<DOUBLE_DIFF_THREASH?"<":">=",
DOUBLE_DIFF_THREASH);
if (!*diffMax){ //self diff check useless TODO REMOVE
if (!memcmp(a,b,n*sizeof(*a)))
printf("EXACT MATCHING AMONG THE 2 DOUBLE VECTORS!\n");
}
}
return out;
}
void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers){
printf("printing matrix: %lu x %lu\n",m,n);
ulong i,j;
for (i=0;i<m;i++){
for (j=0;j<n;j++){
if (justNZMarkers) printf("%s",mat[IDX2D(i,j,n)]?".":" ");
else printf("%1.1lf ",mat[IDX2D(i,j,n)]);
}
printf("\n");
}
printf("\n");
}
void printVector(double* v,ulong size){
for( ulong i=0;i<size;i++) printf("%1.1lf ",v[i]);
printf("\n");
}
////VAR -- MISC
/* search for pattern in @str from NULL terminated @patterns array
* return pointer of pattern in @str if any, otherwise NULL
*/
static inline char* searchPatternsInStr(char** patterns,char* str){
char* out = NULL;
for (char** p=patterns; !out && *p; p++) out = strstr(str,*p);
return out;
}
/* search for pattern in @str from NULL terminated @patterns array
* return pointer of pattern in @str if any, otherwise NULL
*/
static inline char* searchPatternInStrs(char* pattern,char** strings){
char* out = NULL;
for (char** str=strings; !out && *str; str++) out = strstr(*str,pattern);
return out;
}
inline int appendArr(ulong val,APPENDARRAY* list){
return 0; //TODO
}
///DECOMPRESSION IN TMPFS ; TODO vector not puttable here... figure out nicer way..
char* COMPRESS_EXTENSIONS[] = { ".gz", ".xz", ".bz2", ".zip", NULL};
#define STD_DECOMPR_FLAGS " -d -c "
char* DECOMPRESS_CMDS[] = {"gzip" STD_DECOMPR_FLAGS, "xz" STD_DECOMPR_FLAGS, "bzip2" STD_DECOMPR_FLAGS,
"unzip -c", NULL }; //zip is too old ;)
int extractInTmpFS(char* path, char* tmpFsDecompressPath){
char* ext = searchPatternsInStr(COMPRESS_EXTENSIONS,path);
if (!ext) return -1; //NOT SUPPORTED COMPRESS EXTENSION -> TRY REGULAR PATH
//search first 2 char after dot of ext in DECOMPRESS_CMDS to get the decompress cmd
if (strlen(ext) < 3 ){
ERRPRINTS("NOT SUPPORTED DECOMPRESSION:\textension %s too short to be matched",ext);
return -1;
}
char extStart[3];
extStart[0] = ext[1];
extStart[1] = ext[2];
extStart[2] = '\0';
char* decompressCmdBase = searchPatternInStrs(extStart,DECOMPRESS_CMDS);
if (!decompressCmdBase){
ERRPRINTS("NOT SUPPORTED DECOMPRESS FOR %s\n",ext);
return -1;
}
uint cmdLen = strlen(decompressCmdBase) + strlen(path) + 4 + strlen(tmpFsDecompressPath);
char* decompressCmd = alloca(cmdLen+1);
if (snprintf(decompressCmd,cmdLen+1,"%s %s > %s",
decompressCmdBase,path,tmpFsDecompressPath) < 0){
ERRPRINT("extractInTmpFS, snprintf errd\n");
return -1;
}
VERBOSE printf("decompressing %s --> %s\ncmd:\t%s\n",path,tmpFsDecompressPath,decompressCmd);
return system(decompressCmd);
}
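/* usage sketch: extractInTmpFS("mat.mtx.gz","/tmp/mat.mtx") matches the ".gz" extension,
 * builds and runs `gzip -d -c mat.mtx.gz > /tmp/mat.mtx` via system(), returning its status;
 * unsupported/uncompressed paths return -1 so callers fall back to the original file path */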

@ -0,0 +1,107 @@
#include "../include/Sp3MM_CSR_OMP_Multi.h"
#include "../include/utils.h"
/**
* @brief performs multiplication of two sparse matrices
* A and B stored in the CRS format. The resulting matrix is C
*
 * @param[in] a_m number of rows of A
 * @param[in] a_n number of columns of A
* @param[in] a_nz number of non zero elements of A
* @param[in] a_as array with the non zero coefficients of A
* @param[in] a_ja array with the column indices of the non
* zero coefficients of A
* @param[in] a_irp array with the indices of the beginning of
* each row in a_as and a_ja
 * @param[in] a_rl array with the length of each row of A
* @param[in] a_max_row_nz maximum number of coefficients in a row in A
 * @param[in] b_m number of rows of B
 * @param[in] b_n number of columns of B
* @param[in] b_nz number of non zero elements of B
* @param[in] b_as array with the non zero coefficients of B
* @param[in] b_ja array with the column indices of the non
* zero coefficients of B
* @param[in] b_irp array with the indices of the beginning of
* each row in b_as and b_ja
 * @param[in] b_rl array with the length of each row of B
* @param[in] b_max_row_nz maximum number of coefficients in a row in B
 * @param[out] c_m number of rows of C
 * @param[out] c_n number of columns of C
* @param[out] c_nz number of non zero elements in C
* @param[out] c_as array with the non zero coefficients of C
* @param[out] c_ja array with the column indices of the non
* zero coefficients of C
* @param[out] c_irp array with the indices of the beginning of
* each row in c_as and c_ja
 * @param[out] c_rl array with the length of each row of C
* @param[out] c_max_row_nz maximum number of coefficients in a row in C
* @param[out] info return value to check if the operation was successful
*/
#ifdef ROWLENS
void psb_f_spmm_row_by_row_ub_0(idx_t a_m, idx_t a_n, idx_t a_nz,
double *a_as, idx_t *a_ja,
idx_t *a_irp, idx_t *a_rl, idx_t a_max_row_nz,
idx_t b_m, idx_t b_n, idx_t b_nz,
double *b_as, idx_t *b_ja,
idx_t *b_irp, idx_t *b_rl, idx_t b_max_row_nz,
idx_t *c_m, idx_t *c_n, idx_t *c_nz,
double **c_as, idx_t **c_ja,
idx_t **c_irp, idx_t **c_rl, idx_t *c_max_row_nz,
int info)
#else
void psb_f_spmm_row_by_row_ub_0(idx_t a_m, idx_t a_n, idx_t a_nz,
double *a_as, idx_t *a_ja,
idx_t *a_irp, idx_t a_max_row_nz,
idx_t b_m, idx_t b_n, idx_t b_nz,
double *b_as, idx_t *b_ja,
idx_t *b_irp, idx_t b_max_row_nz,
idx_t *c_m, idx_t *c_n, idx_t *c_nz,
double **c_as, idx_t **c_ja,
idx_t **c_irp, idx_t *c_max_row_nz,
int info)
#endif
{
int rc;
//stack storage for the descriptors handed to the spmm implementation
spmat a_mat = {0}, b_mat = {0}, *c = NULL;
spmat *a = &a_mat, *b = &b_mat;
CONFIG cfg_mem = {0}, *cfg = &cfg_mem;
//TODO cfg->threadNum and cfg->chunkDistrbFunc are not set by getConfig() below
#ifdef ROWLENS
a->RL = a_rl;
b->RL = b_rl;
#endif //ROWLENS
// setting up cfg
// TODO : CHECK THAT THIS IS COMPATIBLE WITH PSB
rc = getConfig(cfg);
// setting up spmat type matrices
a->M = a_m;
a->N = a_n;
a->NZ = a_nz;
a->AS = a_as;
a->JA = a_ja;
a->IRP = a_irp;
a->MAX_ROW_NZ = a_max_row_nz;
b->M = b_m;
b->N = b_n;
b->NZ = b_nz;
b->AS = b_as;
b->JA = b_ja;
b->IRP = b_irp;
b->MAX_ROW_NZ = b_max_row_nz;
// performing spmm
c = spmmRowByRow_0(a, b, cfg);
// output result
*(c_m) = c->M;
*(c_n) = c->N;
*(c_nz)= c->NZ;
*(c_as)= c->AS;
*(c_ja)= c->JA;
*(c_irp)=c->IRP;
#ifdef ROWLENS
*(c_rl)= c->RL;
#endif
}

@ -0,0 +1,94 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
//CORE IMPLEMENTATIONS HEADER
#ifndef SP3MM_CSR_OMP_MULTI_H
#define SP3MM_CSR_OMP_MULTI_H
///commons single implementation stuff
#include "macros.h"
#include "sparseMatrix.h"
///aux structures
//holds the SPMM result over an unpartitioned space shared among threads (per row or rows' block)
typedef struct{
//space to hold SPMM output
ulong* JA;
double* AS;
ulong size; //num of entries allocated -> only dbg checks
ulong lastAssigned; //last JA&AS assigned index to an accumulator(atom)
SPACC* accs; //SPARSIFIED ACC POINTERS
uint accsNum;
} SPMM_ACC; //accumulator for SPMM
///compute function interface and its pointer definitions
typedef spmat* ( SPMM) (spmat*,spmat*,CONFIG*);
typedef spmat* (*SPMM_INTERF) (spmat*,spmat*,CONFIG*);
typedef spmat* ( SP3MM) (spmat*,spmat*,spmat*,CONFIG*,SPMM_INTERF);
typedef spmat* (*SP3MM_INTERF) (spmat*,spmat*,spmat*,CONFIG*,SPMM_INTERF);
typedef enum {
_1D_DIRECT,
_1D_BLOCKS,
_2D_OFFSET,
_2D_ALLOCD
} SPMM_IMPL_TYPE;
///-- commons single implementation stuff
///includes
#include "linuxK_rbtree_minimalized.h"
#include "Sp3MM_CSR_OMP_SymbStep_Multi.h"
//extern char TRGT_IMPL_START_IDX; //multi implementation switch
#include "sparseUtilsMulti.h"
#ifdef OFF_F //save "includer" OFF_F value before overwriting it
#pragma push_macro("OFF_F")
#define _OFF_F_OLD
#undef OFF_F
#endif
#define OFF_F 0
#include "Sp3MM_CSR_OMP_UB_Generic.h"
#include "Sp3MM_CSR_OMP_Num_Generic.h"
#undef OFF_F
#define OFF_F 1
#include "Sp3MM_CSR_OMP_UB_Generic.h"
#include "Sp3MM_CSR_OMP_Num_Generic.h"
#undef OFF_F
#ifdef _OFF_F_OLD
#pragma pop_macro("OFF_F")
#undef _OFF_F_OLD
#endif
#endif //SP3MM_CSR_OMP_MULTI_H

@ -0,0 +1,95 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OFF_F
/*#pragma message("generic implementation requires OFF_F defined")*/
#error generic implementation requires OFF_F defined
#endif
/////SYMBOLIC - NUMERIC IMPLEMENTATIONS
///SP3MM FUNCTIONS
/*
 * triple matrix multiplication among @R * @AC * @P using Gustavson's parallel formulation,
 * implemented as a pair of subsequent spmm operations
 * if @conf->spmm != NULL, it will be used as the spmm function, otherwise heuristics will be
 * used to decide which implementation to use
*/
SP3MM CAT(sp3mmRowByRowPair_SymbNum_,OFF_F);
/*
 * row-by-row-by-row implementation: forwarding the r-th row of @R*@AC to @P for row-by-row
 * accumulation in preallocated space, TODO exactly determined
 * basic parallelization: 1 thread per row of @R, which also forwards the result to @P
*/
SP3MM CAT(sp3mmRowByRowMerged_SymbNum_,OFF_F);
///SUB FUNCTIONS
///SPMM FUNCTIONS
/*
* sparse parallel implementation of @A * @B parallelizing Gustavson row-by-row
* formulation using an aux dense vector @_auxDense
* return resulting product matrix
*/
SPMM CAT(spmmRowByRow_SymbNum_,OFF_F);
/*
* sparse parallel implementation of @A * @B parallelizing Gustavson
* with partitioning of @A in @conf->gridRows blocks of rows
* return resulting product matrix
*/
SPMM CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F);
/*
 * sparse parallel implementation of @A * @B as Gustavson parallelized in 2D
* with partitioning of
* @A into rows groups, uniform rows division
* @B into cols groups, uniform cols division, accessed by aux offsets
*/
SPMM CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F);
/*
 * sparse parallel implementation of @A * @B as Gustavson parallelized in 2D
 * with partitioning of
 * @A into rows groups, uniform rows division
 * @B into cols groups, uniform cols division, ALLOCATED as CSR submatrices
*/
SPMM CAT(spmmRowByRow2DBlocksAllocated_SymbNum_,OFF_F);
///implementation wrappers as static array of function pointers
//sp3mm as pair of spmm
static SPMM_INTERF CAT(Spmm_SymbNum_Funcs_,OFF_F)[] = {
& CAT(spmmRowByRow_SymbNum_,OFF_F),
& CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F),
//& CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F),
//& CAT(spmmRowByRow2DBlocksAllocated_SymbNum_,OFF_F)
};
//sp3mm as pair of spmm
static SP3MM_INTERF CAT(Sp3mm_SymbNum_Funcs_,OFF_F)[] = {
//& CAT(sp3mmRowByRowMerged_SymbNum_,OFF_F)
};
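/* usage sketch (C 0-based side, after the OFF_F=0 expansion of this header):
 *	SPMM_INTERF spmm = Spmm_SymbNum_Funcs_0[0];	//spmmRowByRow_SymbNum_0
 *	spmat *RAC = spmm(R,AC,&cfg);			//R*AC
 *	spmat *out = spmm(RAC,P,&cfg);			//(R*AC)*P
 * R, AC, P and the CONFIG cfg are assumed to be set up by the caller */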

@ -0,0 +1,195 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * CSR Sp[3]MM Symbolic step implementations
 * target: compute the output matrix size and the row lens for preallocation
 *         direct write out of partial results
 * See interfaces in respective header
 *
 * MultiImplementations functions with all parameters
 * several (lower level) functions have a config based on
 * OUT_IDXS  for returning or also using the symbMul output idxs somehow
 * COL_PARTS for returning or also using the distribution of symb out idxs in col partitions
 */
///MultiImplementations
///setup aux macros for different signatures implementation via #if arith expr
#ifdef OUT_IDXS
#define _OUT_IDXS TRUE
#else
#define _OUT_IDXS FALSE
#define OUT_IDXS _UNDEF
#endif
#ifdef COL_PARTS
#define _COL_PARTS TRUE
#else
#define _COL_PARTS FALSE
#define COL_PARTS _UNDEF
#endif
///
#if !defined OFF_F
	#error generic implementations require OFF_F
#endif
/*
 * Compute symbolic product of (nnz indexes of) row @aRowJA and matrix @b
 * insert nnz indexes of the mul. result row as nodes in a rbtree rooted at @root
 * with nodes in @nodes, which have to be enough for the mul result row (use an UB)
 * Returns: multiplication result row NNZ number, see CONFIG_MACROS below for more
 *
 * CONFIG_MACROS:
 * if _OUT_IDXS == TRUE	return mul. result row nnz idxs in @outIdxs
 *	ifdef: OUT_IDXS_RBTREE_NODES:	nnz indexes returned in-place, sorting the rbtree
 *				 	as nnz indexes (JA) of the mul result row
 *	 	else:	stop at returning the mul. result row length
 * if _COL_PARTS == TRUE	return the number of nonzero elements
 *	in each of the @gridCols column partitions inside @rowColPartsLens
 * OFF_F:	offset back indexes from fortran
 * TODO also output indexes are shifted (see c_b )
idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F) (idx_t* aRowJA,idx_t aRowLen,
spmat* b,rbRoot* root,rbNode* nodes
#if _OUT_IDXS == TRUE
#ifndef OUT_IDXS_RBTREE_NODES
,idx_t* outIdxs
#endif
#endif
#if _COL_PARTS == TRUE
,ushort gridCols,idx_t* rowColPartsOffsets
#endif
);
*/
/*
* SPVECT_IDX_DENSE_MAP based, as SpMM_Row_Symb_Rbtree but with idxMap aux idx keeping
* CONFIG_MACROS (new)
* IDX_RMUL_SYMB_RBTREE && ( _OUT_IDXS == T || _COL_PARTS == T ):
* (tmp) symb mult out indexes will be kept via a rbtree
* otherwise directly in the out array appending them and then sorting them
* (potentially same n log n)
idx_t CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)
(
idx_t* aRowJA, idx_t aRowLen, spmat* b, SPVECT_IDX_DENSE_MAP* idxsMapAcc
#if _OUT_IDXS == TRUE
,idx_t* outIdxs
#if IDX_RMUL_SYMB_RBTREE == TRUE
,rbRoot* root, rbNode* nodes
#endif
#endif // _OUT_IDXS == TRUE
#if _COL_PARTS == TRUE
,ushort gridCols,idx_t* rowColPartsLens
#endif
);
*/
/*
 * Compute symbolic product of sparse matrices @a * @b
 * Alloc aux structures based on an upper bounded allocation
 * Returns array of exact row lens of the result mul matrix c=@a*@b
 * plus an extra entry for the cumulative num of matrix nnz
 * (interoperability with upper bound implementations)
 *
 * CONFIG_MACROS:
 * if _OUT_IDXS == TRUE:
 * 	return in *@outIdxs pointers to the over-allocated JAs nnz indexes
 *	*outIdxs[0] --> start of first row (its size in 0th of returned array)
 *	it is also the malloc returned address, for the later free
 * 	[NB: rows are not contiguous in the allocation]
 * if _COL_PARTS == TRUE: return in *@rowColPartsLens
 *	a matrix @a->M * @gridCols of offsets
 *	for each column partition in each row
 */
idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
(
ROW_MMSYM_IMPL_MODE symbRowImplID, spmat* a, spmat* b
#if _OUT_IDXS == TRUE
,idx_t*** outIdxs
#endif
#if _COL_PARTS == TRUE
,ushort gridCols, idx_t** rowColPartsLens
#endif
);
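#if 0	//illustrative usage sketch (assumption, not part of the original sources)
//base configuration only (no OUT_IDXS / COL_PARTS): size the output CSR of c = a*b
//from the exact row lens returned by the symbolic step, then free the aux array
idx_t* rowLens = CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F) (RBTREE, a, b);
if (rowLens){
	idx_t cNNZ = rowLens[a->M];	//extra last entry: cumulative nnz of c
	//... malloc c->JA and c->AS with cNNZ entries, build c->IRP from rowLens ...
	free(rowLens);
}
#endif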
//////Sp3MM - rowByrowByrow
#define SP3MM_UB_HEURISTIC 2
#if !defined COL_PARTS && defined OUT_IDXS
///MultiImplementations functions without COL_PARTS
//NB required OUT_IDXS for initial row-by-row step...
/*
 * as earlier, but meant to work with the
 * triple product as row-by-row-by-row forwarding:
 * @abRowJATmp	is a tmp storage for the first row-by-row sym product
 * has to be big enough to store all the possible nonzero elements
 * @nodes	has to be big enough to store all possible nnz of the ab and abc rows
 * 		(not only the ab row as in the earlier version)
 *
 */
idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
(
ROW_MMSYM_IMPL_MODE symbMMRowImplID, idx_t* aRowJA,idx_t aRowLen,
spmat* b,spmat* c, rbRoot* root,rbNode* nodes, idx_t* abRowJATmp
#if _OUT_IDXS == TRUE
#ifndef OUT_IDXS_RBTREE_NODES
,idx_t* outIdxs
#endif
#endif
);
/*
 * triple product @a*@b*@c as row-by-row-by-row forwarding:
 * Returns: resulting matrix rows' sizes exactly in an array
 *	plus an extra element for the res.matrix's total size
 * CONFIG_MACROS:
 * if _OUT_IDXS == TRUE:
 * 	return in *@outIdxs pointers to the over-allocated JAs nnz indexes
 *	*outIdxs[0] --> start of first row (its size in 0th of returned array)
 *	it is also the malloc returned address, for the later free
 * 	[NB: rows are not contiguous in the allocation]
 */
//TODO HEURISTICS IN _OUT_IDXS to avoid serializing after first sym.product
// see HEURISTICS_UB
idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
(
ROW_MMSYM_IMPL_MODE symbMMRowImplID, spmat* a, spmat* b, spmat* c
#if _OUT_IDXS == TRUE
,idx_t*** outIdxs
#endif
);
#endif
#undef OUT_IDXS
#undef _OUT_IDXS
#undef COL_PARTS
#undef _COL_PARTS

@ -0,0 +1,80 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SP3MM_CSR_OMP_SYMB_MULTI
#define SP3MM_CSR_OMP_SYMB_MULTI
/* Multi implementation of symbolic product of sparse matrices, config macros
* OFF_F: C-Fortran spmat indexing
* OUT_IDXS: indexes output
* COL_PARTS: partitioning columns output...
*/
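/* Illustrative note (assumption, not part of the original sources):
 * each include below is expected to emit one family of CAT4-composed symbols,
 * e.g. for OFF_F=0:
 *   SpMM_Symb___0, SpMM_Symb_OutIdxs__0, SpMM_Symb__ColParts_0, SpMM_Symb_OutIdxs_ColParts_0
 * and the analogous _1-suffixed set for the Fortran (OFF_F=1) indexing */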
#define OUT_IDXS_ON OutIdxs_
#define COL_PARTS_ON ColParts_
#undef OUT_IDXS
#undef COL_PARTS
#define OFF_F 0
///generate basic versions
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
///generate outIdxs versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
#undef OUT_IDXS
///generate colParts versions
#define COL_PARTS COL_PARTS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
//generate outIdxs AND colParts versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
#undef OUT_IDXS
#undef COL_PARTS
#undef OFF_F
#define OFF_F 1
///generate basic versions
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
///generate outIdxs versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
#undef OUT_IDXS
///generate colParts versions
#define COL_PARTS COL_PARTS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
//generate outIdxs AND colParts versions
#define OUT_IDXS OUT_IDXS_ON
#include "Sp3MM_CSR_OMP_SymbStep_Generic.h"
#undef OUT_IDXS
#undef COL_PARTS
#undef OFF_F
#endif

@ -0,0 +1,95 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OFF_F
/*#pragma message("generic implementation requires OFF_F defined")*/
#error generic implementation requires OFF_F defined
#endif
///SP3MM FUNCTIONS
/*
 * triple matrix multiplication among @R * @AC * @P using Gustavson's parallel implementation
 * implemented as a pair of subsequent spmm operations
 * if @conf->spmm != NULL, it will be used as the spmm function, otherwise heuristics will be
 * used to decide which implementation to use
*/
SP3MM CAT(sp3mmRowByRowPair_,OFF_F);
/*
* row-by-row-by-row implementation: forwarding @R*@AC rth row to P for row-by-row
* accumulation in preallocated space, TODO exactly determined
 * basic parallelization: 1 thread per @R row, which will also forward the result to P
*/
SP3MM CAT(sp3mmRowByRowMerged_,OFF_F);
///SUB FUNCTIONS
///SPMM FUNCTIONS
SPMM CAT(spmmSerial_,OFF_F);	//single-thread version for oracle-less debugging
/*
* sparse parallel implementation of @A * @B parallelizing Gustavson row-by-row
* formulation using an aux dense vector @_auxDense
* return resulting product matrix
*/
SPMM CAT(spmmRowByRow_,OFF_F);
/*
* sparse parallel implementation of @A * @B parallelizing Gustavson
* with partitioning of @A in @conf->gridRows blocks of rows
* return resulting product matrix
*/
SPMM CAT(spmmRowByRow1DBlocks_,OFF_F);
/*
 * sparse parallel implementation of @A * @B as Gustavson parallelized in 2D
* with partitioning of
* @A into rows groups, uniform rows division
* @B into cols groups, uniform cols division, accessed by aux offsets
*/
SPMM CAT(spmmRowByRow2DBlocks_,OFF_F);
/*
 * sparse parallel implementation of @A * @B as Gustavson parallelized in 2D
* with partitioning of
* @A into rows groups, uniform rows division
* @B into cols groups, uniform cols division, ALLOCATED as CSR submatrixes
*/
SPMM CAT(spmmRowByRow2DBlocksAllocated_,OFF_F);
///implementation wrappers as static array of function pointers
//sp3mm as pair of spmm
static SPMM_INTERF CAT(Spmm_UB_Funcs_,OFF_F)[] = {
& CAT(spmmRowByRow_,OFF_F),
& CAT(spmmRowByRow1DBlocks_,OFF_F),
& CAT(spmmRowByRow2DBlocks_,OFF_F),
& CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)
};
//sp3mm as direct product
static SP3MM_INTERF CAT(Sp3mm_UB_Funcs_,OFF_F)[] = {
& CAT(sp3mmRowByRowMerged_,OFF_F)
};
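#if 0	//illustrative selection sketch (assumption, not part of the original sources)
//pick one of the upper-bound SpMM implementations for the current indexing (OFF_F),
//e.g. from a CONFIG field or a size-based heuristic (hypothetical criterion)
uint implIdx = 0;
SPMM_INTERF  spmmImpl  = CAT(Spmm_UB_Funcs_,OFF_F) [ implIdx ];
SP3MM_INTERF sp3mmImpl = CAT(Sp3mm_UB_Funcs_,OFF_F)[ 0 ];
#endif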

@ -0,0 +1,406 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
///scalar-vector multiply
//TODO evaluate the gain of using only the generic version of the following 2 functions
/*
* Sparse vector part <->scalar multiplication in a dense output
* sparse vector part will hold nnz values in @vectVals (AS subvector)
* with corresponding indexes in @vectIdxs in range [0,@vectLen] (JA subvector)
* resulting vector accumulated in a dense array in @aux->v, along with nnzIdx
 * both the accumulator's dense array and the nnzIdx in @aux have to be at least @vectLen big
*/
inline void CAT(scSparseVectMul_,OFF_F)
(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux){
for (ulong i=0,j; i<vectLen; i++){
j = vectIdxs[i]-OFF_F;
DEBUGCHECKS{
if (j>=aux->vLen){
fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
assert(j < aux->vLen);
}
}
aux->v[j] += vectVals[i] * scalar; //accumulate
//append new nonzero index to auxVNNZeroIdxs for quick sparsify
//if (!(aux->v[j])) aux->nnzIdx[ aux->nnzIdxMap.len++ ] = j; //TODO numerical zero may then cause readd
if(!spVect_idx_in(j,&aux->nnzIdxMap))
aux->nnzIdx[ aux->nnzIdxMap.len-1 ] = j;
}
}
/* Same as scSparseVectMul_ above, but considering an initial offset to remove from each idx
* Sparse vector part <->scalar multiplication in a dense output
* @vectVals: sparse vector part values ( from spmat.AS )
* with [at least] @vectLen corresponding
* target idxs in @vectIdxs (from spmat.JA ) starting from @startIdx
*
* Resulting vector accumulated in a dense array in @aux->v, along with nnzIdx
 * all nnz values' indexes will be shifted back by @startIdx in @aux
 * both the accumulator's dense array and the nnzIdx in @aux have to be at least @vectLen big
*/
inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux){
for (ulong i=0,j; i<vectLen; i++){
j = vectIdxs[i]-OFF_F - startIdx;
DEBUGCHECKS{
if (j>=aux->vLen){
fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
assert(j < aux->vLen);
}
}
aux->v[j] += vectVals[i] * scalar; //accumulate
//append new nonzero index to auxVNNZeroIdxs for quick sparsify
//if (!(aux->v[j])) aux->nnzIdx[ aux->nnzIdxMap.len++ ] = j;
////TODO numerical zero may then cause readd
if(!spVect_idx_in(j,&aux->nnzIdxMap))
aux->nnzIdx[ aux->nnzIdxMap.len-1 ] = j;
}
}
///MultiImplementations functions
#ifndef OFF_F
	#error generic implementations require OFF_F defined
#endif
///TODO IMPLEMENT SCALAR<->ROW MUL AS GENERIC SPARSE VECTOR<->SCALAR MUL
//inline void scSparseRowMul(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
ulong rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
#ifdef ROWLENS
rowLen = mat->RL[trgtR];
#else
rowLen = mat->IRP[trgtR+1] - mat->IRP[trgtR];
#endif
CAT(scSparseVectMul_,OFF_F)(scalar,mat->AS+rowStartIdx,mat->JA+rowStartIdx,rowLen,aux);
//TODO check impact of generic version use
}
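#if 0	//illustrative Gustavson row-by-row sketch (assumption, not part of the original sources)
//accumulate row r of C = A*B into the dense accumulator @acc, then sparsify it
for (ulong jj = A->IRP[r]-OFF_F; jj < A->IRP[r+1]-OFF_F; jj++){
	ulong  k    = A->JA[jj] - OFF_F;	//col idx in A == row idx in B
	double a_rk = A->AS[jj];
	CAT(scSparseRowMul_,OFF_F)(a_rk, B, k, acc);	//acc->v += a_rk * B.row(k)
}
//... sparsify @acc into C's row r (e.g. sparsifyDirect / sparsifyUBNoPartsBounds) ...
#endif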
///OUTPUT SIZE PREDICTION
/* O(A.NZ [+B.M] )
* return array of upper bounded row sizes of the product @A * @B
 * the cumulative total size of the matrix AB = A*B is also appended at the end
*/
inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
AUDIT_INTERNAL_TIMES Start = omp_get_wtime();
idx_t* rowSizes = calloc((A->M+1), sizeof(*rowSizes));
if (!rowSizes){
ERRPRINT("spMMSizeUpperbound: rowSizes calloc errd\n");
return NULL;
}
idx_t fullMatBound = 0;
#pragma omp parallel for schedule(static) reduction(+:fullMatBound)
for (idx_t r=0; r<A->M; r++){
for (idx_t jj=A->IRP[r] - OFF_F,j,rlen; jj<A->IRP[r+1] - OFF_F; jj++){
j = A->JA[jj] - OFF_F;
#ifdef ROWLENS
rlen = B->RL[j];
#else
rlen = B->IRP[j+1] - B->IRP[j]; //OFF_F: delta is offset independent
#endif
rowSizes[r] += rlen;
fullMatBound += rlen;
//rowSizes[A->M] += rlen; //just below omp reduction sum
}
}
rowSizes[A->M] = fullMatBound;
AUDIT_INTERNAL_TIMES End= omp_get_wtime();
VERBOSE
printf("spMMSizeUpperbound:%lu\t%le s\n",rowSizes[A->M],End-Start);
return rowSizes;
}
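#if 0	//illustrative usage sketch (assumption, not part of the original sources)
//size the shared sparsify space with the upper bound before the numeric phase
idx_t* ubRowLens = CAT(spMMSizeUpperbound_,OFF_F)(A,B);
if (ubRowLens){
	SPMM_ACC* acc = initSpMMAcc(ubRowLens[A->M], A->M);	//UB entries, A->M accumulators (hypothetical sizing)
	//... numeric SpMM: each row reserves its exact slice via sparsifyUBNoPartsBounds ...
	freeSpMMAcc(acc);
	free(ubRowLens);
}
#endif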
/* O(A.NZ + B.NZ)
* return matrix @A.M x gridCols of upper bound
* for each of the @gridCols col partitions of the output matrix AB = @A * @B
 * the cumulative total size of the matrix AB is also appended at the end
*/
inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
(spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets){
AUDIT_INTERNAL_TIMES Start = omp_get_wtime();
idx_t* rowPartsSizes = calloc((A->M*gridCols +1), sizeof(*rowPartsSizes));
if (!rowPartsSizes){
ERRPRINT("spMMSizeUpperbound: rowPartsSizes calloc errd\n");
return NULL;
}
idx_t fullMatBound = 0;
#pragma omp parallel for schedule(static) reduction(+:fullMatBound)
for (idx_t r=0; r<A->M; r++){
//for each A.row -> sum B.colParts lens
for (idx_t jj=A->IRP[r]-OFF_F,j,rlen; jj<A->IRP[r+1]-OFF_F; jj++){
j = A->JA[jj] - OFF_F;
for (idx_t gc=0,bPartID=IDX2D(j,gc,gridCols); gc < gridCols; gc++,bPartID++){
rlen = bColPartOffsets[bPartID+1] - bColPartOffsets[bPartID];
rowPartsSizes[ IDX2D(r,gc,gridCols) ] += rlen;
fullMatBound += rlen;
}
}
}
rowPartsSizes[ A->M*gridCols ] = fullMatBound;
AUDIT_INTERNAL_TIMES End= omp_get_wtime();
VERBOSE
printf("spMMSizeUpperboundColParts_:%lu\t%le s\n",rowPartsSizes[A->M],End-Start);
return rowPartsSizes;
}
////Single implementations funcs
#ifndef SPMMUTILS_H_SINGLE_IMPLEMENTATION
#define SPMMUTILS_H_SINGLE_IMPLEMENTATION
#include "utils.h"
///Allocs - Free
//SpMM holder of accumulats
inline SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum){
SPMM_ACC* out = calloc(1,sizeof(*out));
if (!out){
ERRPRINT("initSpMMAcc: out calloc errd\n");
return NULL;
}
out->size = entriesNum;
if (!(out->JA = malloc(entriesNum * sizeof(*(out->JA))))){
ERRPRINT("initSpMMAcc: JA malloc errd\n");
goto _err;
}
if (!(out->AS = malloc(entriesNum * sizeof(*(out->AS))))){
ERRPRINT("initSpMMAcc: AS malloc errd\n");
goto _err;
}
if (!(out->accs = malloc(accumulatorsNum * sizeof(*(out->accs))))){
ERRPRINT("initSpMMAcc: accs malloc errd\n");
goto _err;
}
return out;
_err:
if (out->JA) free(out->JA);
if (out->AS) free(out->AS);
if (out->accs) free(out->accs);
if (out) free(out);
return NULL;
}
inline void freeSpMMAcc(SPMM_ACC* acc){
free(acc->JA);
free(acc->AS);
free(acc->accs);
free(acc);
}
///// dense acc sparsify functions
/*
 * sparsify dense accumulated vector @accV (shifted by @startColAcc)
* into sparse accumulator @accSparse that'll use space for nnz entries from @acc
*/
///DIRECT OUT MATRIX SPARSIFY
/*
* sparsify @accV directly inside row @row of matrix @m
 * considering, if not NULL, a 2D partitioning with
 * @gridCols column groups and the per-row column-group offsets matrix @colPartsOffsets
 * Returns: in-place modification of @m
*/
static inline void sparsifyDirect(ACC_DENSE* accV,spmat* m,idx_t row){
idx_t nnz = accV->nnzIdxMap.len, sparsifyStart = m->IRP[row], sparsifyEnd = m->IRP[row+1];
sort_idx_t(accV->nnzIdx,nnz); //sort nnz idx for ordered write
DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
DEBUGCHECKS assert(nnz <= (sparsifyEnd - sparsifyStart));
for (idx_t i=0,j; i < nnz; i++){
j = accV->nnzIdx[i];
m -> JA[sparsifyStart + i] = j; //+ startColAcc;
m -> AS[sparsifyStart + i] = accV->v[j];
}
}
//TODO 2D partitioning - colParts version of above...best to use multi impl trick of symbStep
static inline void sparsifyDirectColParts(ACC_DENSE* accV,spmat* m,idx_t row,
ushort colGroupId,ushort gridCols, idx_t* colPartsOffsets,idx_t startCol){
	idx_t nnz = accV->nnzIdxMap.len;
	idx_t partID = IDX2D(row,colGroupId,gridCols);	//idx_t: row*gridCols may exceed ushort
	idx_t sparsifyStart = colPartsOffsets[partID], sparsifyEnd = colPartsOffsets[partID+1];
	sort_idx_t(accV->nnzIdx,nnz);	//sort nnz idx for ordered write
	DEBUGCHECKS		assertArrNoRepetitions(accV->nnzIdx,nnz);
	DEBUGCHECKS		assert( nnz == sparsifyEnd - sparsifyStart );
for (idx_t i=0,j; i < nnz; i++){
j = accV->nnzIdx[i];
m -> JA[sparsifyStart + i] = j + startCol;
m -> AS[sparsifyStart + i] = accV->v[j];
}
}
///UB SPACE SPARSIFY
//internal sparsify of a dense acc inside a (prepared) sparse acc struct
static inline void _sparsifyUB(ACC_DENSE* accV,SPACC* accSparse,idx_t startColAcc){
idx_t nnz = accV->nnzIdxMap.len;
sort_idx_t(accV->nnzIdx,nnz); //sort nnz idx for ordered write
DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
for (idx_t i=0,j; i < nnz; i++){
j = accV->nnzIdx[i];
accSparse -> JA[i] = j + startColAcc;
accSparse -> AS[i] = accV->v[j];
}
accSparse -> len = nnz;
}
//row[Part] sparsified in a thread-safe, exactly-sized reserved area using atomics
static inline void sparsifyUBNoPartsBounds
(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc){
//sort nnz indexes of dense accumulator
idx_t nnz = accV->nnzIdxMap.len;
idx_t sparsifyStartV; //start index(inside @accSparse) of @accV to sparsify
//sparsifyStartV = __atomic_fetch_add(&(acc->lastAssigned),nnz,__ATOMIC_ACQ_REL);
#pragma omp atomic capture
{ //fetch and add like ....
sparsifyStartV = acc->lastAssigned;
acc->lastAssigned += nnz;
}
DEBUGCHECKS{
if (acc->lastAssigned >= acc->size){
ERRPRINT("OMP ATOMIC OR SG ERRD IN SPACE ASSIGNMENTS...\n");
assert(acc->lastAssigned < acc->size);
}
}
//
accSparse -> AS = acc->AS + sparsifyStartV;
accSparse -> JA = acc->JA + sparsifyStartV;
_sparsifyUB(accV,accSparse,startColAcc);
}
////output-gather functions
/*
 * merge the @conf->gridCols * @mat->M sparse row partitions into @mat
 * EXPECTED: @rowsParts sorted in accord with the
 * 2D rowMajor computing grid given in @conf
 * arrays to hold the nonzero values and indexes are alloc'd into @mat
*/
inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
CONFIG* conf){
ulong nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
//TODO PARALLEL MERGE - SAVE STARTING offset OF EACH PARTITION IN THE OUT MATRIX
ulong* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
///count nnz entries and alloc arrays for them
for (ulong r=0; r<mat->M; r++){
//for each partition ->get len -> outMat.IRP and aux offsets
for (j=0,rLen=0; j<conf->gridCols; j++){
idx = IDX2D(r,j,conf->gridCols);
rowsPartsOffsets[idx]=nzNum+rLen;//part start=prev accumulated end
rLen += rowsParts[idx].len;
}
nzNum += rLen;
mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
mat->RL[r] = rLen;
#endif
}
mat->NZ = nzNum;
if (!(mat->AS = malloc(nzNum * sizeof(*(mat->AS))))){
ERRPRINT("merged sparse matrix AS alloc errd\n");
return EXIT_FAILURE;
}
if (!(mat->JA = malloc(nzNum * sizeof(*(mat->JA))))){
ERRPRINT("merged sparse matrix JA alloc errd\n");
return EXIT_FAILURE;
}
	///populate with rows' nnz values and indexes
ulong pLen; //omp for aux vars
#pragma omp parallel for schedule(static) private(pLen)
for (ulong i=0; i<partsNum; i++){
pLen = rowsParts[i].len;
memcpy(mat->AS + rowsPartsOffsets[i],rowsParts[i].AS,pLen*sizeof(*(mat->AS)));
memcpy(mat->JA + rowsPartsOffsets[i],rowsParts[i].JA,pLen*sizeof(*(mat->JA)));
}
CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
for (ulong i=0,w=0; i<mat->M; i++){
if (mat->IRP[i] != w)
{ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
for (j=0; j<conf->gridCols; j++){
SPACC r = rowsParts[IDX2D(i,j,conf->gridCols)];
for (ulong jj=0; jj<r.len; jj++,w++){
if (mat->AS[w]!= r.AS[jj]){
ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
if (mat->JA[w]!= r.JA[jj]){
ERRPRINT("MERGE ROW ERR JA\n"); return -1;}
}
}
}
}
return EXIT_SUCCESS;
}
/*
 * merge the @mat->M sparse rows @rows into sparse matrix @mat
 * EXPECTED: @rows sorted in accord with the target matrix row index @r
 * arrays to hold the nonzero values and indexes are alloc'd into @mat
*/
inline int mergeRows(SPACC* rows,spmat* mat){
ulong nzNum=0;
//count nnz entries and alloc arrays for them
for (ulong r=0; r<mat->M; ++r){
nzNum += rows[r].len;
mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
		mat->RL[r] = rows[r].len;
#endif
}
mat->NZ = nzNum;
if (!(mat->AS = malloc(nzNum * sizeof(*(mat->AS))))){
ERRPRINT("merged sparse matrix AS alloc errd\n");
return EXIT_FAILURE;
}
if (!(mat->JA = malloc(nzNum * sizeof(*(mat->JA))))){
ERRPRINT("merged sparse matrix JA alloc errd\n");
return EXIT_FAILURE;
}
	///POPULATE WITH ROWS' NNZ VALUES AND INDEXES
//TODO PARALLEL COPY
#pragma omp parallel for schedule(static)
for (ulong r=0; r<mat->M; r++){
memcpy(mat->AS+mat->IRP[r], rows[r].AS, rows[r].len*sizeof(*(mat->AS)));
memcpy(mat->JA+mat->IRP[r], rows[r].JA, rows[r].len*sizeof(*(mat->JA)));
}
CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
for (ulong r=0,i=0; r<mat->M; r++){
if (i != mat->IRP[r])
{ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
for (ulong j=0; j<rows[r].len; j++,i++){
if (mat->AS[i]!= rows[r].AS[j]){
ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
if (mat->JA[i] != rows[r].JA[j]){
ERRPRINT("MERGE ROW ERR JA\n"); return -1;}
}
}
}
return EXIT_SUCCESS;
}
#endif //SPMMUTILS_H_SINGLE_IMPLEMENTATION

@ -0,0 +1,56 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPMMUTILSMULTI_H
#define SPMMUTILSMULTI_H
extern char TRGT_IMPL_START_IDX; //multi implementation switch
#ifdef OFF_F //save "includer" OFF_F value before overwriting it
#pragma push_macro("OFF_F")
#define _OFF_F_OLD
#undef OFF_F
#endif
#define OFF_F 0
#include "SpMMUtilsGeneric.h"
#undef OFF_F
#define OFF_F 1
#include "SpMMUtilsGeneric.h"
#undef OFF_F
#ifdef _OFF_F_OLD
#pragma pop_macro("OFF_F")
#undef _OFF_F_OLD
#endif
#endif

@ -0,0 +1,155 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CONFIG_H
#define CONFIG_H
//CONFIG STRUCT DEPENDENCIES
//switch among different symb row -> rows implementation variants
typedef enum{
RBTREE,
IDXMAP
} ROW_MMSYM_IMPL_MODE;
//
typedef struct{
ushort gridRows;
ushort gridCols;
//TODO FULL CONFIG DOCCED HERE
ROW_MMSYM_IMPL_MODE symbMMRowImplID; //how to compute symb mul (if required)
uint threadNum; //thread num to use in an OMP parallel region ...
void* chunkDistrbFunc; //CHUNKS_DISTR_INTERF func pntr
} CONFIG;
///Smart controls
typedef size_t idx_t; //spmat indexes
typedef unsigned __int128 uint128;
#include "macros.h"
#ifndef SPARSIFY_PRE_PARTITIONING
	#define SPARSIFY_PRE_PARTITIONING TRUE //u.b. implementation will sparsify the dense acc in a pre-split mem area
#endif
///AUDIT&CHECKS
//debug checks and tmp stuff
#ifndef DEBUG
#define DEBUG if( TRUE )
#endif
//long prints
#ifndef DEBUGPRINT
#define DEBUGPRINT if( FALSE )
#endif
//heavy impact debug checks
#ifndef DEBUGCHECKS
#define DEBUGCHECKS if( FALSE )
#endif
//extra print in the normal output
#ifndef AUDIT_INTERNAL_TIMES
#define AUDIT_INTERNAL_TIMES if( TRUE )
#endif
#ifndef VERBOSE
#define VERBOSE if( FALSE )
#endif
//extra checks over the input and the correctness of partials
#ifndef CONSISTENCY_CHECKS
#define CONSISTENCY_CHECKS if( TRUE )
#endif
#ifndef SPVECT_IDX_BITWISE //SPVECT_IDX_DENSE_ACC.nnzIdxsFlags will be an array of bitflags
#define SPVECT_IDX_BITWISE TRUE
#endif
#if SPVECT_IDX_BITWISE == TRUE
#ifndef LIMB_T
#define LIMB_T uint128
#endif
typedef LIMB_T limb_t;
typedef limb_t* nnz_idxs_flags_t;
#define LIMB_SIZE_BIT ( sizeof(limb_t) * 8 )
#else	//nnz idxs are flags in a byte array
typedef uchar* nnz_idxs_flags_t;
#define LIMB_SIZE_BIT ( sizeof(uchar) * 8 )
#endif
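//e.g. with SPVECT_IDX_BITWISE == TRUE the presence flag of column j is assumed to live in
//limb j / LIMB_SIZE_BIT, bit j % LIMB_SIZE_BIT, of an nnz_idxs_flags_t array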
///AUDIT extra configuration
//#define ROWLENS
#ifdef ROWLENS
/*#pragma message("ROW_LENS ARRAY ENABLED")*/
#endif
/*
* idxsMapAcc based symb row*rows, outIdxs and colParts carried with an aux rbtree
 * otherwise carried through an append array
* (hyp same computational cost... n + nlog n)
*/
#ifndef IDX_RMUL_SYMB_RBTREE
#define IDX_RMUL_SYMB_RBTREE FALSE
#endif
#ifndef RB_CACHED_INSERT
#define RB_CACHED_INSERT TRUE //use cached insert
#endif
//#define USE_RB_ROOT_CACHE_LMOST //use leftmost leaf cached in rbtree in sym mul
///CONSTS
#define ELL_MAX_ENTRIES ( 6l << 27 ) //2*6GB of ell (padded) entries maxSupport in a matrix
#define LIMIT_ELL_SIZE //enable above threshold
#define ELL_AS_FILLER (0 ) //handled with calloc
//TODO NOW FILLED WITH LAST NNPADDED COL #define ELL_JA_FILLER (-1)
//#define DOUBLE_VECT_DIFF_EARLY_EXIT 1
//#define RNDVECTMIN 222222
#define VECTOR_STEP_REALLOC 25
#define VECTOR_READ_BLOCK 50 //file (raw) vector read block
#define RNDVECTORSIZE 100000
#define RNDVECTORDUMP TMPDIR "rndVectorDump"
#define RNDVECTORDUMPRAW TMPDIR "rndVectorDumpRaw"
#define OUTVECTORDUMP TMPDIR "outVectorDump"
#define OUTVECTORDUMPRAW TMPDIR "outVectorDumpRaw"
//#define FLOAT_DIFF_ABS
#ifndef AVG_TIMES_ITERATION
#define AVG_TIMES_ITERATION 5
#endif
//ompChunksDivide.h -> chunksFairFolded()
#ifndef FAIR_CHUNKS_FOLDING
#define FAIR_CHUNKS_FOLDING 4
#endif
//SPMV specific
//rows partitions for dotProduct SIMD reduction enable
#ifndef SIMD_ROWS_REDUCTION
#define SIMD_ROWS_REDUCTION TRUE
#endif
/*#if SIMD_ROWS_REDUCTION == TRUE
#pragma message("SIMD_ROWS_REDUCTION enabled")
//TODO SOME TRICK TO HAVE 1! PRINT
#endif*/
extern double Start,End,Elapsed,ElapsedInternal;
#define DOUBLE_DIFF_THREASH 7e-4
#define DRNG_DEVFILE "/dev/urandom"
#define MAXRND 3e-5
#ifndef TMPDIR
#define TMPDIR "/tmp/"
#endif
#define TMP_EXTRACTED_MARTIX TMPDIR "extractedMatrix"
#endif //CONFIG_H

@ -0,0 +1,49 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
//inline export - single implementation functions
void cleanRbNodes(rbRoot* root,rbNode* nodes,idx_t nodesNum);
int rbInsertNewKey(rbRoot *root,rbNode *node, idx_t key);
//void C_FortranShiftIdxs(spmat* outMat);
//void Fortran_C_ShiftIdxs(spmat* m);
ACC_DENSE* _initAccVectors(ulong num,ulong size);
ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size); //TODO PERF WITH NEXT
SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum);
idx_t reductionMaxSeq(idx_t* arr,idx_t arrLen);
int _allocAccDense(ACC_DENSE* v,ulong size);
int mergeRows(SPACC* rows,spmat* mat);
int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,CONFIG* conf);
void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num);
void _resetAccVect(ACC_DENSE* acc);
void _resetIdxMap(SPVECT_IDX_DENSE_MAP* acc);
void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
void freeAccsDense(ACC_DENSE* vectors,ulong num);
void freeSpMMAcc(SPMM_ACC* acc);
void sparsifyDenseVect(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc);

@ -0,0 +1,43 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "macros.h"
#ifndef OFF_F
#error generic implementation requires OFF_F defined
#endif
////inline exports
//multi implementation functions
void CAT(scSparseVectMul_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux);
void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux);
void CAT(_scRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B);
idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)(spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets);

@ -0,0 +1,335 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
  Userspace GNUC porting, removal of augmented deps and minimization to a few OPs only:	Andrea Di Iorio
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This will avoid us to use callbacks and to drop drammatically performances.
I know it's not the cleaner way, but in C (not in C++) to get
performances and genericity...
See Documentation/core-api/rbtree.rst for documentation and samples.
*/
/*
* RedBlackTree_linux_userspace
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the RedBlackTree_linux_userspace or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE RedBlackTree_linux_userspace GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* https://bitbucket.org/andysnake96/redblacktree_linux_userspace
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
//#include <linux/kernel.h> //TODO LESS_DEPENDENCIES
//#include <linux/stddef.h> //TODO LESS_DEPENDENCIES
//#include <linux/rcupdate.h> //TODO LESS_DEPENDENCIES
#include <stdlib.h>
#include <stdio.h>
#include <macros.h>			///my useful macros :)
#include <macrosLinuxMock.h>	///my useful macros :)
struct rb_node {
unsigned long __rb_parent_color;
struct rb_node *rb_right;
struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */
struct rb_root {
struct rb_node *rb_node;
};
/////MINIMALIZE fulfilling rbtree_augmented deps TODO
///rb_node colors labels
#define RB_RED 0
#define RB_BLACK 1
#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
#define __rb_color(pc) ((pc) & 1)
#define __rb_is_black(pc) __rb_color(pc)
#define __rb_is_red(pc) (!__rb_color(pc))
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
static inline void rb_set_parent_color(struct rb_node *rb,
struct rb_node *p, int color)
{
rb->__rb_parent_color = (unsigned long)p | color;
}
///rb_set_parent //TODO GENERIC ADDONLY
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
struct rb_node *parent, struct rb_root *root)
{
if (parent) {
if (parent->rb_left == old)
WRITE_ONCE(parent->rb_left, new);
else
WRITE_ONCE(parent->rb_right, new);
} else
WRITE_ONCE(root->rb_node, new);
}
/////////////////////////////
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define RB_ROOT (struct rb_root) { NULL, }
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
*rb_link = node;
}
/*static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
rcu_assign_pointer(*rb_link, node);
}**/ //TODO LESS_DEPENDENCIES
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
* given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
* list_for_each_entry_safe() and allows the iteration to continue independent
* of changes to @pos by the body of the loop.
*
* Note, however, that it cannot handle other modifications that re-order the
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
* rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
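#if 0	/* illustrative usage sketch (assumption, not part of the original sources) */
/* postorder visit of a tree whose elements embed a struct rb_node named 'rb';
 * safe even if the visit releases the backing memory of each visited element */
struct item { int payload; struct rb_node rb; };	/* hypothetical element type */
struct item *pos, *n;
rbtree_postorder_for_each_entry_safe(pos, n, &someRoot, rb)	/* someRoot: struct rb_root */
	free(pos);
#endif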
/*
* Leftmost-cached rbtrees.
*
* We do not cache the rightmost node based on footprint
* size vs number of potential users that could benefit
* from O(1) rb_last(). Just not worth it, users that want
* this feature can always implement the logic explicitly.
* Furthermore, users that want to cache both pointers may
* find it a bit asymmetric, but that's ok.
*/
struct rb_root_cached {
struct rb_root rb_root;
struct rb_node *rb_leftmost;
};
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
static inline void rb_insert_color_cached(struct rb_node *node,
struct rb_root_cached *root,
bool leftmost)
{
if (leftmost)
root->rb_leftmost = node;
rb_insert_color(node, &root->rb_root);
}
static inline void rb_erase_cached(struct rb_node *node,
struct rb_root_cached *root)
{
if (root->rb_leftmost == node)
root->rb_leftmost = rb_next(node);
rb_erase(node, &root->rb_root);
}
static inline void rb_replace_node_cached(struct rb_node *victim,
struct rb_node *new,
struct rb_root_cached *root)
{
if (root->rb_leftmost == victim)
root->rb_leftmost = new;
rb_replace_node(victim, new, &root->rb_root);
}
///////////////////////////////////////////////////////////////////////////////
/***END OF ORIGINAL LINUX KERNEL HEADER MINIMALIZED
***FEW AUX FUNCS TO SUPPORT SpMM-Symbolic
***/
#include <string.h>
#include "config.h"
typedef struct{
idx_t key;
struct rb_node rb;
/* following fields used for testing augmented rbtree functionality
u32 val;
u32 augmented; ///only for AUGMENTED_TEST
*/
} rbNode;
//static struct rb_root_cached root = RB_ROOT_CACHED;
typedef struct rb_root_cached rbRoot;
/*
* return 1 if @node with the given key @key
* has been inserted in rbtree rooted at @root; 0 otherwise
*/
static inline int rbInsertStdNewKey(rbRoot *root,rbNode *node, idx_t key)
{
struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
idx_t parentK;
while (*new) {
parent = *new;
parentK = rb_entry(parent, rbNode, rb)->key;
if (key < parentK) new = &parent->rb_left;
else if (key > parentK) new = &parent->rb_right;
else return 0; //already in
DEBUGCHECKS assert( *new != parent );
}
//insert the node in the correct position, assigning the key
/*DEBUGCHECKS{ //check for double insertion
rbNode testNode;
memset(&testNode,0,sizeof(testNode));
assert( !memcmp(node,&testNode,sizeof(*node)) );
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n))
assert( n != *new );
}*/
node->key = key;
rb_link_node(&node->rb, parent, new);
rb_insert_color(&node->rb, &root->rb_root);
return 1;
}
static inline int rbInsertCachedNewKey(rbRoot *root,rbNode *node, idx_t key)
{
struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
idx_t parentK;
bool leftmost = true;
while (*new) {
parent = *new;
parentK = rb_entry(parent, rbNode, rb)->key;
if (key < parentK) new = &parent->rb_left;
else if (key > parentK) {
new = &parent->rb_right;
leftmost = false;
}
else return 0;
}
//insert the node in the correct position, assigning the key
/*DEBUGCHECKS{ //check for double insertion
rbNode testNode;
memset(&testNode,0,sizeof(testNode));
assert( !memcmp(node,&testNode,sizeof(*node)) );
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n))
assert( n != *new );
}*/
node->key = key;
rb_link_node(&node->rb, parent, new);
rb_insert_color_cached(&node->rb, root, leftmost);
return 1;
}
static inline int rbInsertNewKey(rbRoot *root,rbNode *node, idx_t key){
#if RB_CACHED_INSERT == TRUE
return rbInsertCachedNewKey(root,node,key);
#else
return rbInsertStdNewKey(root,node,key);
#endif
}
#define rbNodeOrderedVisit(n,root) \
for (n = rb_first(&root->rb_root); n; n = rb_next(n));
inline void cleanRbNodes(rbRoot* root,rbNode* nodes,idx_t nodesNum){
memset(nodes,0,nodesNum * sizeof(*nodes));
memset(root,0,sizeof(*root));
}
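#if 0	/* illustrative usage sketch (assumption, not part of the original sources) */
/* count the distinct column indexes of a symbolic row product via the rbtree,
 * then visit them in ascending order; UB and candidates[] are hypothetical */
rbRoot root = RB_ROOT_CACHED;
rbNode nodes[UB];
memset(nodes, 0, sizeof(nodes));
idx_t rowNNZ = 0;
for (idx_t i = 0; i < candidatesNum; i++)	/* duplicate keys are rejected (return 0) */
	rowNNZ += rbInsertNewKey(&root, nodes + rowNNZ, candidates[i]);
for (struct rb_node* n = rb_first(&root.rb_root); n; n = rb_next(n))
	printf("%lu\n", (unsigned long) rb_entry(n, rbNode, rb)->key);
cleanRbNodes(&root, nodes, UB);
#endif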
#endif /* _LINUX_RBTREE_H */

@ -0,0 +1,118 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MACROS
#define MACROS
#include <stdio.h>
#include <stdlib.h>
///aux macro-functions
#define ABS(a) ((a) > 0 ? (a) : -(a))
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define AVG(a,b) ((a)/2 + (b)/2 + ((a)%2+(b)%2)/2)
#define SWAP(a,b) (a)=(a)^(b);(b)=(b)^(a);(a)=(a)^(b)
#define IN_RANGE(i,start,end) ( (start) <= (i) && (i) <= (end) )
//ceil(x/y) with integers
#define INT_DIV_CEIL(x,y) ( ( (x) - 1) / (y) + 1 )
//2D ROW MAJOR indexing wrap compute
#define IDX2D(i,j,nCols) ((j) + (i)*(nCols))
///distribute remainder @rem among groups, giving an extra +1 to the first @rem
#define UNIF_REMINDER_DISTRI(i,div,rem) \
( (div) + ( (i) < (rem) ? 1 : 0 ) )
#define UNIF_REMINDER_DISTRI_STARTIDX(i,div,rem) \
( (i) * (div) + MIN( (i),(rem) ) )
#define UNIF_REMINDER_DISTRI_ENDIDX(i,div,rem) \
( (i+1) * (div) + MIN( (i),(rem) ) )
//shorter name alias
#define unifRemShare(i,div,rem) UNIF_REMINDER_DISTRI(i,div,rem)
#define unifRemShareStart(i,div,rem) UNIF_REMINDER_DISTRI_STARTIDX(i,div,rem)
#define unifRemShareEnd(i,div,rem) UNIF_REMINDER_DISTRI_ENDIDX( (i) , (div) , (rem) )
#define unifRemShareBlock(i,div,rem) unifRemShareStart(i,div,rem), unifRemShare(i,div,rem)
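//e.g. 10 items in 3 groups: div=3, rem=1  ->  unifRemShare(i)      = 4,3,3
//                                             unifRemShareStart(i) = 0,4,7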
#define STATIC_ARR_ELEMENTS_N(arr) (sizeof( (arr) ) / (sizeof(*(arr))))
////STRING UTILS
#define _STR(s) #s
#define STR(s) _STR(s)
///CONCATENATE
//Concatenate preprocessor tokens A and B WITHOUT expanding macro definitions
#define _CAT(a,b) a ## b
#define _CAT3(a,b,c) a ## b ## c
#define _CAT4(a,b,c,d) a ## b ## c ## d
//Concatenate preprocessor tokens A and B EXPANDING macro definitions
#define CAT(a,b) _CAT(a,b)
#define CAT3(a,b,c) _CAT3(a,b,c)
#define CAT4(a,b,c,d) _CAT4(a,b,c,d)
#define _UNDEF _
////PRINTS
#define CHIGHLIGHT "\33[1m\33[92m"
#define CCC CHIGHLIGHT
#define CHIGHLIGHTERR "\33[31m\33[1m\33[44m"
#define CCCERR CHIGHLIGHTERR
#define CEND "\33[0m"
#define hprintsf(str,...) printf( CHIGHLIGHT str CEND,__VA_ARGS__ )
#define hprintf(str) printf( CHIGHLIGHT str CEND)
#define ERRPRINTS(str,...) fprintf( stderr, CHIGHLIGHTERR str CEND,__VA_ARGS__ )
#define ERRPRINT(str) fprintf( stderr, CHIGHLIGHTERR str CEND )
#include <assert.h>
///aux types
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef char bool;
#define TRUE (!0)
#define FALSE 0
#define true TRUE
#define false FALSE
#define T TRUE
#define F FALSE
//smart decimal type custom precision def build macro _DECIMAL_TRGT_PREC
#ifndef _DECIMAL_TRGT_PREC
//dflt floating point precision & formatting chars
#define _DECIMAL_TRGT_PREC double
#define _DECIMAL_TRGT_PREC_PR "%lf"
#else
//TODO SELECT WITH RESPECT TO THE EXPORTED TARGET DECIMAL TYPE
#define _DECIMAL_TRGT_PREC_PR "%f"
#endif
typedef _DECIMAL_TRGT_PREC decimal;
///EXTRA INCLUDE --- cuda
///assertions are disabled at compile time by defining the NDEBUG preprocessor macro before including assert.h
//#ifdef ASSERT #include <assert.h> #endif
#endif //MACROS

@ -0,0 +1,140 @@
/*
* RedBlackTree_linux_userspace
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the RedBlackTree_linux_userspace or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE RedBlackTree_linux_userspace GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MACROSLINUXMOCK
#define MACROSLINUXMOCK
////////////////////////// LINUX KERNEL Userspace RBTree //////////////////////////////
typedef unsigned int u32;
typedef unsigned int cycles_t;
#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
* @type: the type of the container struct this is embedded in.
* @member: the name of the member within the struct.
* TODO LESS_DEPENDENCIES
removed:
BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
!__same_type(*(ptr), void), \
"pointer type mismatch in container_of()"); \
*/
#define container_of(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
((type *)(__mptr - offsetof(type, member))); })
#define NOOP(x) x
//#define NOOP do {} while(0)
#undef unlikely
#define unlikely(x) NOOP(x)
/*
* Yes, this permits 64-bit accesses on 32-bit architectures. These will
* actually be atomic in some cases (namely Armv7 + LPAE), but for others we
* rely on the access being split into 2x32-bit accesses for a 32-bit quantity
* (e.g. a virtual address) and a strong prevailing wind.
*/
#define compiletime_assert_rwonce_type(t) \
compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
"Unsupported access size for {READ,WRITE}_ONCE().")
///TODO OVERWRITTEN FOR PORTING
#undef compiletime_assert_rwonce_type
#define compiletime_assert_rwonce_type(t) NOOP(t)
/*
* Use __READ_ONCE() instead of READ_ONCE() if you do not require any
* atomicity. Note that this may result in tears!
*/
#ifndef __READ_ONCE
#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif
#define READ_ONCE(x) \
({ \
compiletime_assert_rwonce_type(x); \
__READ_ONCE(x); \
})
#define __WRITE_ONCE(x, val) \
do { \
*(volatile typeof(x) *)&(x) = (val); \
} while (0)
/*//TODO ORIGINAL VERSION
#define WRITE_ONCE(x, val) \
do { \
compiletime_assert_rwonce_type(x); \
__WRITE_ONCE(x, val); \
} while (0)
*/
#define WRITE_ONCE(x,val) x = val
/*
* Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
* word from memory atomically but without telling KASAN/KCSAN. This is
* usually used by unwinding code when walking the stack of a running process.
*/
#define READ_ONCE_NOCHECK(x) \
({ \
compiletime_assert(sizeof(x) == sizeof(unsigned long), \
"Unsupported access size for READ_ONCE_NOCHECK()."); \
(typeof(x))__read_once_word_nocheck(&(x)); \
})
#include <x86intrin.h> //rdtsc
static inline cycles_t get_cycles(void)
{
/**#ifndef CONFIG_X86_TSC
if (!boot_cpu_has(X86_FEATURE_TSC))
return 0;
#endif */ ///TODO LESS_DEPENDENCIES
return __rdtsc();
}
/**#define WARN_ON_ONCE(condition) ({ \
static bool __section(".data.once") __warned; \
int __ret_warn_once = !!(condition); \
\
if (unlikely(__ret_warn_once && !__warned)) { \
__warned = true; \
WARN_ON(1); \
} \
unlikely(__ret_warn_once); \
}) */ //TODO LESS_DEPENDENCIES
#include <assert.h>
#define WARN_ON_ONCE(condition) assert( !(condition) )
#define div_u64(a,b) ( a / b )
#define kfree(x) free(x)
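/*
 * Usage sketch for the container_of() mock above (the point3 type and the
 * point3FromZ helper are illustrative only): recover a pointer to the
 * enclosing struct from a pointer to one of its members, as the userspace
 * rbtree code does for its embedded struct rb_node.
 */
typedef struct { double x, y, z; } point3;
static inline point3* point3FromZ(double* zPtr){
	return container_of(zPtr, point3, z);	//walk back from &p->z to p
}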
#endif //MACROSLINUXMOCK

@ -0,0 +1,133 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#ifndef MM_IO_H
#define MM_IO_H
#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64
typedef char MM_typecode[4];
char *mm_typecode_to_str(MM_typecode matcode);
int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);
int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);
/********************* MM_typecode query functions ***************************/
#define mm_is_matrix(typecode) ((typecode)[0]=='M')
#define mm_is_sparse(typecode) ((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode) ((typecode)[1]=='A')
#define mm_is_array(typecode) ((typecode)[1]=='A')
#define mm_is_complex(typecode) ((typecode)[2]=='C')
#define mm_is_real(typecode) ((typecode)[2]=='R')
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')
#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode) ((typecode)[3]=='G')
#define mm_is_skew(typecode) ((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')
int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
/********************* MM_typecode modify functions ***************************/
#define mm_set_matrix(typecode) ((*typecode)[0]='M')
#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
#define mm_set_array(typecode) ((*typecode)[1]='A')
#define mm_set_dense(typecode) mm_set_array(typecode)
#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode) ((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')
#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode) ((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')
#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
(*typecode)[2]=' ',(*typecode)[3]='G')
#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
/********************* Matrix Market error codes ***************************/
#define MM_COULD_NOT_READ_FILE 11
#define MM_PREMATURE_EOF 12
#define MM_NOT_MTX 13
#define MM_NO_HEADER 14
#define MM_UNSUPPORTED_TYPE 15
#define MM_LINE_TOO_LONG 16
#define MM_COULD_NOT_WRITE_FILE 17
/******************** Matrix Market internal definitions ********************
MM_matrix_typecode: 4-character sequence
object sparse/ data storage
dense type scheme
string position: [0] [1] [2] [3]
Matrix typecode: M(atrix) C(oord) R(eal) G(eneral)
A(array) C(omplex) H(ermitian)
P(attern) S(ymmetric)
I(nteger) K(skew)
***********************************************************************/
#define MM_MTX_STR "matrix"
#define MM_ARRAY_STR "array"
#define MM_DENSE_STR "array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR "coordinate"
#define MM_COMPLEX_STR "complex"
#define MM_REAL_STR "real"
#define MM_INT_STR "integer"
#define MM_GENERAL_STR "general"
#define MM_SYMM_STR "symmetric"
#define MM_HERM_STR "hermitian"
#define MM_SKEW_STR "skew-symmetric"
#define MM_PATTERN_STR "pattern"
/* high level routines */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
MM_typecode matcode);
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_);
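/*
 * Minimal read sketch (assumes @f is an already opened MatrixMarket file;
 * the mmReadHeaderSketch name is illustrative only): parse the banner,
 * validate the typecode, then read the COO sizes.
 */
static inline int mmReadHeaderSketch(FILE* f, MM_typecode* tc, int* M, int* N, int* nz){
	if (mm_read_banner(f, tc))			return MM_NO_HEADER;
	if (!mm_is_valid(*tc) || !mm_is_sparse(*tc))	return MM_UNSUPPORTED_TYPE;
	return mm_read_mtx_crd_size(f, M, N, nz);	//0 on success
}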
#endif

@ -0,0 +1,105 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OMPCHUNKSDIVIDE
#define OMPCHUNKSDIVIDE
/*
 * chunk distribution for a runtime schedule set to dynamic, TODO guided
 * generic interface with elements, srcMatrix, configuration
 * TODO the srcMatrix is left in for advanced chunk distribution...
 * the configuration is expected to have a valid threadNum set
 */
#include "config.h"
//distribution of @rows|blocks of @matrix, exploiting @config
typedef void (CHUNKS_DISTR ) (ulong,spmat*,CONFIG*);
typedef void (*CHUNKS_DISTR_INTERF ) (ulong,spmat*,CONFIG*);
//NOOP chunks division for manual configuration via export OMP_SCHEDULE
inline void chunksNOOP(ulong r,spmat* mat,CONFIG* cfg){ return; }
#include "ompGetICV.h"
//fair division of @r elements from matrix @mat with threads num in @cfg
inline void chunksFair(ulong r,spmat* mat,CONFIG* cfg){
assert(cfg->threadNum > 0); //configured target thread num
omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
omp_get_schedule(&kind,&chunk_size);
k = kind;
#if _OPENMP >= 201811 //OMP_SCHED_SCHEDULE modifier from 5.0
monotonic = omp_sched_monotonic & kind;
if(monotonic) k = kind-omp_sched_monotonic;
#endif
(void) monotonic; //else no unused warning
switch(k){
case omp_sched_static :
DEBUG printf("static it's already fair\n");
return;
case omp_sched_dynamic:
chunk_size_new = MAX(1,r/cfg->threadNum);
break;
//case omp_sched_guided :
//case omp_sched_auto :
//default:
}
if(chunk_size == chunk_size_new) return;
omp_set_schedule(kind,chunk_size_new);
VERBOSE printf("chunksFair:\tchunk adapted to %d\n",chunk_size_new);
DEBUG ompGetRuntimeSchedule(NULL);
}
//fair division of @r elements from matrix @mat among the threads in @cfg
//subdividing the fair share by a factor of @FAIR_CHUNKS_FOLDING
inline void chunksFairFolded(ulong r,spmat* mat,CONFIG* cfg){
assert(cfg->threadNum > 0); //configured target thread num
omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
omp_get_schedule(&kind,&chunk_size);
k = kind;
#if _OPENMP >= 201811 //OMP_SCHED_SCHEDULE modifier from 5.0
monotonic = omp_sched_monotonic & kind;
if(monotonic) k = kind-omp_sched_monotonic;
#endif
(void) monotonic; //else no unused warning
switch(k){
case omp_sched_static :
DEBUG printf("static it's already fair\n");
return;
case omp_sched_dynamic:
chunk_size_new = MAX(1,r/(cfg->threadNum*FAIR_CHUNKS_FOLDING));
break;
//case omp_sched_guided :
//case omp_sched_auto :
//default:
}
if(chunk_size == chunk_size_new) return;
omp_set_schedule(kind,chunk_size_new);
DEBUG{ //check with ICV get
printf("chunksFairFolded:\tchunk adapted to %d\n",chunk_size_new);
ompGetRuntimeSchedule(NULL);
}
}
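/*
 * Usage sketch (assumes a parsed spmat @m and a CONFIG @cfg with threadNum>0;
 * setFairDynamicChunks is illustrative only): force a dynamic runtime
 * schedule, then let chunksFair shrink the chunk size to an even share of the
 * rows before a schedule(runtime) parallel loop over them.
 */
static inline void setFairDynamicChunks(spmat* m, CONFIG* cfg){
	omp_set_schedule(omp_sched_dynamic, 0);	//0 -> implementation default chunk size
	chunksFair(m->M, m, cfg);		//adapt chunk size to ~ M/threadNum
}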
#endif

@ -0,0 +1,41 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OMPGETICV_H
#define OMPGETICV_H
//only header definitions
/*
* log sched configuration on stdout
* return kind,monotonic,chunkSize if arg0 not NULL
*/
void ompGetRuntimeSchedule(int* );
void ompGetAllICV(); //only if not OMP_GET_ICV_MAIN
float ompVersionMacroMap(); //version number as float using API dates mappings
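/*
 * Usage sketch (currentScheduleKind is illustrative only, and assumes the
 * three values are written into an int[3] in the documented order):
 * log the runtime schedule and also capture it programmatically.
 */
static inline int currentScheduleKind(void){
	int kmc[3];			//kind, monotonic, chunkSize
	ompGetRuntimeSchedule(kmc);	//logs on stdout and fills the array
	return kmc[0];			//schedule kind
}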
#endif

@ -0,0 +1,193 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2021 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009-2012 Oak Rigde National Laboratory. All rights reserved.
* Copyright (c) 2011-2020 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2013 INRIA. All rights reserved.
* Copyright (c) 2015 University of Houston. All rights reserved.
* Copyright (c) 2015-2021 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017-2019 IBM Corporation. All rights reserved.
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2021-2022 Google, LLC. All rights reserved.
* Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* Copyright (c) 2021 Bull S.A.S. All rights reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* Copyright (c) 2018-2021 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Error classes and codes
* Do not change the values of these without also modifying mpif.h.in.
*/
#define MPI_SUCCESS 0
#define MPI_ERR_BUFFER 1
#define MPI_ERR_COUNT 2
#define MPI_ERR_TYPE 3
#define MPI_ERR_TAG 4
#define MPI_ERR_COMM 5
#define MPI_ERR_RANK 6
#define MPI_ERR_REQUEST 7
#define MPI_ERR_ROOT 8
#define MPI_ERR_GROUP 9
#define MPI_ERR_OP 10
#define MPI_ERR_TOPOLOGY 11
#define MPI_ERR_DIMS 12
#define MPI_ERR_ARG 13
#define MPI_ERR_UNKNOWN 14
#define MPI_ERR_TRUNCATE 15
#define MPI_ERR_OTHER 16
#define MPI_ERR_INTERN 17
#define MPI_ERR_IN_STATUS 18
#define MPI_ERR_PENDING 19
#define MPI_ERR_ACCESS 20
#define MPI_ERR_AMODE 21
#define MPI_ERR_ASSERT 22
#define MPI_ERR_BAD_FILE 23
#define MPI_ERR_BASE 24
#define MPI_ERR_CONVERSION 25
#define MPI_ERR_DISP 26
#define MPI_ERR_DUP_DATAREP 27
#define MPI_ERR_FILE_EXISTS 28
#define MPI_ERR_FILE_IN_USE 29
#define MPI_ERR_FILE 30
#define MPI_ERR_INFO_KEY 31
#define MPI_ERR_INFO_NOKEY 32
#define MPI_ERR_INFO_VALUE 33
#define MPI_ERR_INFO 34
#define MPI_ERR_IO 35
#define MPI_ERR_KEYVAL 36
#define MPI_ERR_LOCKTYPE 37
#define MPI_ERR_NAME 38
#define MPI_ERR_NO_MEM 39
#define MPI_ERR_NOT_SAME 40
#define MPI_ERR_NO_SPACE 41
#define MPI_ERR_NO_SUCH_FILE 42
#define MPI_ERR_PORT 43
#define MPI_ERR_QUOTA 44
#define MPI_ERR_READ_ONLY 45
#define MPI_ERR_RMA_CONFLICT 46
#define MPI_ERR_RMA_SYNC 47
#define MPI_ERR_SERVICE 48
#define MPI_ERR_SIZE 49
#define MPI_ERR_SPAWN 50
#define MPI_ERR_UNSUPPORTED_DATAREP 51
#define MPI_ERR_UNSUPPORTED_OPERATION 52
#define MPI_ERR_WIN 53
#define MPI_T_ERR_MEMORY 54
#define MPI_T_ERR_NOT_INITIALIZED 55
#define MPI_T_ERR_CANNOT_INIT 56
#define MPI_T_ERR_INVALID_INDEX 57
#define MPI_T_ERR_INVALID_ITEM 58
#define MPI_T_ERR_INVALID_HANDLE 59
#define MPI_T_ERR_OUT_OF_HANDLES 60
#define MPI_T_ERR_OUT_OF_SESSIONS 61
#define MPI_T_ERR_INVALID_SESSION 62
#define MPI_T_ERR_CVAR_SET_NOT_NOW 63
#define MPI_T_ERR_CVAR_SET_NEVER 64
#define MPI_T_ERR_PVAR_NO_STARTSTOP 65
#define MPI_T_ERR_PVAR_NO_WRITE 66
#define MPI_T_ERR_PVAR_NO_ATOMIC 67
#define MPI_ERR_RMA_RANGE 68
#define MPI_ERR_RMA_ATTACH 69
#define MPI_ERR_RMA_FLAVOR 70
#define MPI_ERR_RMA_SHARED 71
#define MPI_T_ERR_INVALID 72
#define MPI_T_ERR_INVALID_NAME 73
#define MPI_ERR_PROC_ABORTED 74
/* not #if conditional on OPAL_ENABLE_FT_MPI for ABI */
#define MPI_ERR_PROC_FAILED 75
#define MPI_ERR_PROC_FAILED_PENDING 76
#define MPI_ERR_REVOKED 77
/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined
MPI_ERR_<foo> code. Set the last code to allow some room for adding
error codes without breaking ABI. */
#define MPI_ERR_LASTCODE 92
/*
* Comparison results. Don't change the order of these, the group
* comparison functions rely on it.
* Do not change the order of these without also modifying mpif.h.in.
*/
enum {
MPI_IDENT,
MPI_CONGRUENT,
MPI_SIMILAR,
MPI_UNEQUAL
};
/*
* MPI_Init_thread constants
* Do not change the order of these without also modifying mpif.h.in.
*/
enum {
MPI_THREAD_SINGLE,
MPI_THREAD_FUNNELED,
MPI_THREAD_SERIALIZED,
MPI_THREAD_MULTIPLE
};
/*
* Datatype combiners.
* Do not change the order of these without also modifying mpif.h.in.
* (see also mpif-common.h.fin).
*/
enum {
MPI_COMBINER_NAMED,
MPI_COMBINER_DUP,
MPI_COMBINER_CONTIGUOUS,
MPI_COMBINER_VECTOR,
#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
MPI_COMBINER_HVECTOR_INTEGER,
#else
OMPI_WAS_MPI_COMBINER_HVECTOR_INTEGER, /* preserve ABI compatibility */
#endif
MPI_COMBINER_HVECTOR,
MPI_COMBINER_INDEXED,
#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
MPI_COMBINER_HINDEXED_INTEGER,
#else
OMPI_WAS_MPI_COMBINER_HINDEXED_INTEGER, /* preserve ABI compatibility */
#endif
MPI_COMBINER_HINDEXED,
MPI_COMBINER_INDEXED_BLOCK,
#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
MPI_COMBINER_STRUCT_INTEGER,
#else
OMPI_WAS_MPI_COMBINER_STRUCT_INTEGER, /* preserve ABI compatibility */
#endif
MPI_COMBINER_STRUCT,
MPI_COMBINER_SUBARRAY,
MPI_COMBINER_DARRAY,
MPI_COMBINER_F90_REAL,
MPI_COMBINER_F90_COMPLEX,
MPI_COMBINER_F90_INTEGER,
MPI_COMBINER_RESIZED,
MPI_COMBINER_HINDEXED_BLOCK
};
////MOCK ERRHANDLERS
#define OMPI_ERRHANDLER_INVOKE(comm,err,fname) err
#define OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,fname) err

@ -0,0 +1,90 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PARSER
#define PARSER
#include "mmio.h"
#include "sparseMatrix.h"
typedef struct{
ulong row;
ulong col;
double val;
} entry; //MatrixMarket COO entry
typedef struct{
MM_typecode mcode;
entry* entries;
ulong* rowLens;
ulong M,N,NZ; //spmat sizes
} MatrixMarket;
////COO PARSE
//parse and check MatrixMarket mat in @matPath file
MatrixMarket* MMRead(char* matPath);
void freeMatrixMarket(MatrixMarket* mm);
//basic check for sparse matrix compliance to the app, return posix bool
int MMCheck(MM_typecode typecode);
/*
* parse MatrixMarket matrix entries in @fp, of type @mcode
* into COOrdinate list of entries
 * -> expand symmetric matrices into a full matrix with both halves,
 *    so NZ will be doubled in place
 * return allocated and filled COO entries, writing the NNZ count into @NZ
* NO SORT CHECKING HERE
*/
entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens);
////COO -> ANYTHING ELSE CONVERSION
/*
 * write COO entries in @entries inside sparse matrix @mat in CSR format
 * EXPECTED: CSR arrays allocated, @entries column sorted within (not necessarily consecutive) rows
 * [symmetric halves explicitly represented --> not important here]
*/
int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens);
/*
* write COO entries in @entries inside sparse matrix @mat in ELL format
 * EXPECTED: @entries column sorted within (not necessarily consecutive) rows
* ELL internal array allocated in this function, not freed in case of error
*/
int COOtoELL(entry* entries, spmat* mat, ulong* rowLens);
////wrapper MM -> specialized target
/*
* Parse MatrixMarket matrix stored in file at @matPath
* IMPLEMENTED WRAPPING: MMtoCOO -> COOtoCSR
 * Returns: allocated spmat sparse matrix with all fields allocated
 * symmetric matrices are expanded into a full matrix
*/
spmat* MMtoCSR(char* matPath);
spmat* MMtoELL(char* matPath);
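/*
 * End-to-end usage sketch (assumes a MatrixMarket file at @path;
 * loadCSRSketch is illustrative only): load it as CSR through the
 * MMtoCOO -> COOtoCSR wrapper, use it, free it.
 */
static inline long loadCSRSketch(char* path){
	spmat* m = MMtoCSR(path);	//NULL on parse/alloc error
	if (!m)	return -1;
	long nnz = m->NZ;		//the CSR arrays AS,JA,IRP are ready to use here
	freeSpmat(m);
	return nnz;
}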
#endif

@ -0,0 +1,216 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
//sparse matrix def & aux
#ifndef SPARSEMATRIX
#define SPARSEMATRIX
#include <stdlib.h>
#include <string.h>
#include "macros.h"
#include "config.h"
typedef struct {
idx_t NZ,M,N;
double *AS;
idx_t* JA;
//CSR SPECIFIC
idx_t* IRP;
#ifdef ROWLENS
idx_t* RL; //row lengths
#endif
//CUDA SPECIFIC
idx_t MAX_ROW_NZ;
} spmat; //describe a sparse matrix
//smart index keeping in a dense map
typedef struct{
idx_t len; //num of nnz idx accumulated
/* nnz index presence packing, implicit space enough for all possible indexes */
nnz_idxs_flags_t idxsMap;
uint idxsMapN; //either num of limbs or len of char flag array
} SPVECT_IDX_DENSE_MAP;
int initSpVectIdxDenseAcc(idx_t idxMax, SPVECT_IDX_DENSE_MAP* );
inline void _resetIdxMap(SPVECT_IDX_DENSE_MAP* acc){
acc->len = 0;
memset(acc->idxsMap,0,sizeof(*acc->idxsMap)*acc->idxsMapN);
}
inline void _freeIdxMap(SPVECT_IDX_DENSE_MAP* acc){
free(acc->idxsMap);
free(acc);
}
//aux struct for sparse vector-scalar product accumulation
typedef struct{
double* v; //aux accumulating dense vector (sparse)
idx_t* nnzIdx; //v nnz value's indexes (contiguous)
idx_t vLen; //size of the aux dense vector //TODO USELESS?
SPVECT_IDX_DENSE_MAP nnzIdxMap; //
} ACC_DENSE;
int allocAccDense(ACC_DENSE* v, ulong size);
/*
**** NNZ IDXS PRESENCE FLAGS ACCESS INTERFACE: ***
* sp_vect_idx_set(idx,SPVECT_IDX_DENSE_MAP)
 * -> returns 0 if @idx is not already set, setting it in that case
*/
inline int spVect_idx_in(idx_t idx, SPVECT_IDX_DENSE_MAP* idxsMapAcc){
#if SPVECT_IDX_BITWISE == TRUE
uint limbID = idx / LIMB_SIZE_BIT; //idx's limb id
uint limbIdxID = idx % LIMB_SIZE_BIT; //idx's pos in limb
DEBUGCHECKS assert( limbID < idxsMapAcc->idxsMapN );
limb_t idxPos = ((limb_t) 1) << limbIdxID;
if (!(idxsMapAcc->idxsMap[limbID] & idxPos) ){
idxsMapAcc->idxsMap[limbID] |= idxPos;
idxsMapAcc->len++;
return 0;
}
#else
//assert( idx < idxsMapAcc->idxsMapN ); //TODO
if (!( idxsMapAcc->idxsMap[idx] )){ //TODO an atomic compare-and-swap primitive could be used here, merging ops at the HW level?
idxsMapAcc->idxsMap[idx] = 1;
idxsMapAcc->len++;
return 0;
}
#endif //SPVECT_IDX_BITWISE == TRUE
return 1;
}
inline void _resetAccVect(ACC_DENSE* acc){
memset(acc->v, 0, acc->vLen * sizeof(*(acc->v)));
memset(acc->nnzIdx, 0, acc->vLen * sizeof(*(acc->nnzIdx)));
_resetIdxMap(&acc->nnzIdxMap);
}
////Sparse vector accumulator -- corresponding to a matrix portion
typedef struct{
//idx_t r; //row index in the corresponding matrix
//idx_t c; //col index in the corresponding matrix
idx_t len; //rowLen
double* AS; //row nnz values
idx_t* JA; //row nnz colIndexes
} SPACC;
/*
* ARRAY BISECTION - RECURSIVE VERSION
 * TODO ASSERT LEN>0 omitted
*/
inline int BISECT_ARRAY(idx_t target, idx_t* arr, idx_t len){
//if (len == 0) return FALSE;
if (len <= 1) return *arr == target;
idx_t middleIdx = (len-1) / 2; //len=5-->2, len=4-->1
idx_t middle = arr[ middleIdx ];
if (target == middle) return TRUE;
else if (target < middle) return BISECT_ARRAY(target,arr,middleIdx);
else return BISECT_ARRAY(target,arr+middleIdx+1,middleIdx + (len-1)%2);
}
/*
* return !0 if col @j idx is in row @i of sparse mat @smat
 * bisection used --> O(log_2(ROWLENGTH))
*/
inline int IS_NNZ(spmat* smat,idx_t i,idx_t j){
idx_t rStart = smat->IRP[i];
idx_t rLen = smat->IRP[i+1] - rStart;
if (!rLen) return FALSE;
return BISECT_ARRAY(j,smat->JA + rStart,rLen);
}
inline int IS_NNZ_linear(spmat* smat,idx_t i,idx_t j){ //linear -> O(ROWLENGTH)
int out = 0;
for (idx_t x=smat->IRP[i]; x<smat->IRP[i+1] && !out; x++){
out = (j == smat->JA[x]);
}
return out;
}
////aux functions
//free sparse matrix
inline void freeSpmatInternal(spmat* mat){
if(!mat) return;
free(mat->AS);
free(mat->JA);
free(mat->IRP);
#ifdef ROWLENS
free(mat->RL);
#endif
}
inline void freeSpmat(spmat* mat){
if (!mat) return;
freeSpmatInternal(mat);
free(mat);
}
static inline void zeroSpmat(spmat* m) {
memset(m->AS, 0, sizeof(*(m->AS)) * m->NZ);
memset(m->JA, 0, sizeof(*(m->JA)) * m->NZ);
memset(m->IRP, 0, sizeof(*(m->IRP)) * (m->M + 1));
}
//free the SPACC aux struct's internal arrays (if not NULL)
inline void freeSpAcc(SPACC* r){
free(r->AS);
free(r->JA);
}
////alloc&init functions
//alloc&init internal structures only dependent of dimensions @rows,@cols
inline int allocSpMatrixInternal(idx_t rows, idx_t cols, spmat* mat){
mat -> M = rows;
mat -> N = cols;
if (!(mat->IRP=calloc(mat->M+1,sizeof(*(mat->IRP))))){ //calloc only for 0th
ERRPRINT("IRP calloc err\n");
freeSpmatInternal(mat);
return EXIT_FAILURE;
}
#ifdef ROWLENS
if (!(mat->RL = malloc(mat->M*sizeof(*(mat->RL))))){
ERRPRINT("RL calloc err\n");
freeSpmatInternal(mat);
return EXIT_FAILURE;
}
#endif
return EXIT_SUCCESS;
}
//alloc a sparse matrix of @rows rows and @cols cols
inline spmat* allocSpMatrix(idx_t rows, idx_t cols){
spmat* mat;
if (!(mat = calloc(1,sizeof(*mat)))) {
ERRPRINT("mat calloc failed\n");
return NULL;
}
if (allocSpMatrixInternal(rows,cols,mat)){
free(mat);
return NULL;
}
return mat;
}
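/*
 * Construction sketch (0-based CSR, illustrative values and name): build the
 * 2x3 matrix [[5 0 2],[0 3 0]] by hand and query it with IS_NNZ.
 */
static inline int csrHandBuildSketch(void){
	spmat* m = allocSpMatrix(2, 3);
	if (!m)	return EXIT_FAILURE;
	m->NZ = 3;
	m->AS = malloc(m->NZ * sizeof(*m->AS));
	m->JA = malloc(m->NZ * sizeof(*m->JA));
	if (!m->AS || !m->JA){ freeSpmat(m); return EXIT_FAILURE; }
	double as[]  = {5, 2, 3};
	idx_t  ja[]  = {0, 2, 1};
	idx_t  irp[] = {0, 2, 3};
	memcpy(m->AS,  as,  sizeof(as));
	memcpy(m->JA,  ja,  sizeof(ja));
	memcpy(m->IRP, irp, sizeof(irp));
	int ok = IS_NNZ(m, 0, 2) && !IS_NNZ(m, 1, 0);	//expect 1
	freeSpmat(m);
	return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}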
#endif

@ -0,0 +1,138 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "sparseMatrix.h"
#include "config.h"
#ifndef OFF_F
#error OFF_F required
#endif
//////////////////////////////// CSR SPECIFIC /////////////////////////////////
///SPARSE MATRIX PARTITIONING
/*
* partition CSR sparse matrix @A in @gridCols columns partitions
* returning an offsets matrix out[i][j] = start of jth colPartition of row i
* subdivide @A columns in uniform cols ranges in the output
*/
idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
/*
* partition CSR sparse matrix @A in @gridCols columns partitions as
 * independent, allocated sparse matrices and return them
* subdivide @A columns in uniform cols ranges in the output
*/
spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
//same as above but with (aux) use of offsets partitioning (also returned if @colPartsOffsets != NULL)
spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,idx_t** colPartsOffsets);
//same as checkOverallocPercent but with 2D partitioning - CSR col partitioning
void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
idx_t gridCols,idx_t* bColOffsets);
///////////////////////////////////////////////////////////////////////////////
///Single implementations headers
#ifndef SPARSEUTILS_H_COMMON_IDX_IMPLS
#define SPARSEUTILS_H_COMMON_IDX_IMPLS
//shift every index of the sparse data so the matrix can be used in a Fortran app
inline void C_FortranShiftIdxs(spmat* m){
for(ulong r=0; r<m->M+1; m -> IRP[r]++,r++);
for(ulong i=0; i<m->NZ; m -> JA[i]++, i++);
}
//shift every index of the sparse data so the matrix can be used in a C app
inline void Fortran_C_ShiftIdxs(spmat* m){ //TODO DBG ONLY and completeness
for(ulong r=0; r<m->M+1; m -> IRP[r]--,r++);
for(ulong i=0; i<m->NZ; m -> JA[i]--, i++);
}
/*
* check SpMM resulting matrix @AB = A * B nnz distribution in rows
* with the preallocated, forecasted size in @forecastedSizes
 * @forecastedSizes holds the forecasted size of each row,
 * with the cumulative size of the whole matrix in its last entry
*/
void checkOverallocPercent(ulong* forecastedSizes,spmat* AB);
/*
 check if sparse matrices A <-> B differ by at most
 DOUBLE_DIFF_THREASH per element
*/
int spmatDiff(spmat* A, spmat* B);
////dyn alloc of spMM output matrix
/*
///size prediction of AB = @A * @B
inline ulong SpMMPreAlloc(spmat* A,spmat* B){
//TODO BETTER PREALLOC HEURISTICS HERE
return MAX(A->NZ,B->NZ);
}
//init a sparse matrix AB=@A * @B with a initial allocated space by an euristic
inline spmat* initSpMatrixSpMM(spmat* A, spmat* B){
spmat* out;
if (!(out = allocSpMatrix(A->M,B->N))) return NULL;
out -> NZ = SpMMPreAlloc(A,B);
if (!(out->AS = malloc(out->NZ*sizeof(*(out->AS))))){
ERRPRINT("initSpMatrix: out->AS malloc errd\n");
free(out);
return NULL;
}
if (!(out->JA = malloc(out->NZ*sizeof(*(out->JA))))){
ERRPRINT("initSpMatrix: out->JA malloc errd\n");
freeSpmat(out);
return NULL;
}
return out;
}
#define REALLOC_FACTOR 1.5
//realloc sparse matrix NZ arrays
inline int reallocSpMatrix(spmat* mat,ulong newSize){
mat->NZ *= newSize;
void* tmp;
if (!(tmp = realloc(mat->AS,mat->NZ * sizeof(*(mat->AS))))){
ERRPRINT("reallocSpMatrix: realloc AS errd\n");
return EXIT_FAILURE;
}
mat->AS = tmp;
if (!(tmp = realloc(mat->JA,mat->NZ * sizeof(*(mat->JA))))){
ERRPRINT("reallocSpMatrix: realloc JA errd\n");
return EXIT_FAILURE;
}
mat->JA = tmp;
return EXIT_SUCCESS;
}
*/
////MISC
//print useful information about 3SPMM about to compute
void print3SPMMCore(spmat* R,spmat* AC,spmat* P,CONFIG* conf);
void printSparseMatrix(spmat* sparseMat,char justNZMarkers);
/*convert @sparseMat sparse matrix in dense matrix returned*/
double* CSRToDense(spmat* sparseMat);
void freeAccsDense(ACC_DENSE* vectors,ulong num);
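/*
 * Usage sketch (illustrative @gridCols and sketch name; the offsets buffer is
 * presumed heap allocated and laid out row-major, i.e.
 * offs[i*gridCols + j] = start of the jth column partition of row i):
 * partition the columns of a CSR matrix @A and address a partition start.
 */
static inline int CAT(colPartitionOffsetsSketch_,OFF_F)(spmat* A, uint gridCols){
	idx_t* offs = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(A, gridCols);
	if (!offs)	return EXIT_FAILURE;
	idx_t row0Part0Start = offs[0];	//start of row 0, partition 0 (presumed layout)
	(void) row0Part0Start;
	free(offs);
	return EXIT_SUCCESS;
}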
#endif //SPARSEUTILS_H_COMMON_IDX_IMPLS

@ -0,0 +1,52 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPARSEUTILSMULTI_H
#define SPARSEUTILSMULTI_H
#ifdef OFF_F //save "includer" OFF_F value before overwriting it
#pragma push_macro("OFF_F")
#define _OFF_F_OLD
#undef OFF_F
#endif
#define OFF_F 0
#include "sparseUtilsGeneric.h"
#undef OFF_F
#define OFF_F 1
#include "sparseUtilsGeneric.h"
#undef OFF_F
#ifdef _OFF_F_OLD
#pragma pop_macro("OFF_F")
#undef _OFF_F_OLD
#endif
#endif

@ -0,0 +1,181 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTILS
#define UTILS
//#include <stddef.h>
#include "macros.h"
#include "linuxK_rbtree_minimalized.h"
extern int urndFd; //file descriptor of DRNG_DEVFILE, opened O_RDONLY
int init_urndfd(); // wrap init urndFd
///IO
//UNBUFFERED IO
/*
* urndFd usage template to populate random timeout
if(_read_wrap(urndFd,(char*)&timeout,sizeof(timeout))<0){
fprintf(stderr,"rnd read for thread's timeout failed");
ret=EXIT_FAILURE;
goto end;
}
*/
//wrap read cycle over @fd
int read_wrap(int fd,void* dst,size_t count);
int fread_wrap(FILE* fp,void* dst,size_t count);
//dual of read_wrap
int write_wrap(int fd,void* src,size_t count);
//create or open file at @outFpath for write
int createNewFile(char* const outFpath);
///STRUCTURED DATA IO
#define DOUBLE_STR_FORMAT "%25le\n"
//write double vector @v as row sequence of double at @fpath
//e.g. read with od -tf8 -w8 fpath : OCTALOFFSET: DOUBLE FULL DIGITS
int writeDoubleVector(char* fpath,double* v,ulong size);
/*
 * read a vector of double [Str] of arbitrary size from @fpath, true length in *size
 * if @size points to a nonzero value, the initial allocation will be of *size
 * any successive reallocation is done multiplying *size by VECTOR_STEP_REALLOC
*/
double* readDoubleVector(char* fpath,ulong* size);
double* readDoubleVectorStr(char* fpath,ulong* size);
///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
//dual of readDoubleVector[Str]
int writeDoubleVectorAsStr(char* fpath,double* v,ulong size);
int MPI_Dims_create(int nnodes, int ndims, int dims[]); //commons/ompi_dims_create/ompi_dims_create.c
#include "config.h"
///config from ENV
#define GRID_ROWS "GRID_ROWS"
#define GRID_COLS "GRID_COLS"
//parse configuration from env
int getConfig(CONFIG* conf);
//append only list implemented with a reallocated array
typedef struct{
ulong* a;
ulong size;
ulong lastIdx;
} APPENDARRAY;
//append @val to @list, reallocating if reached end
//TODO inline int appendArr(ulong val,APPENDARRAY* list);
void sortuint(uint* arr, uint len); //sort uint array @arr of @len elements
void sort_idx_t(idx_t* arr, idx_t len);
void sortulong(ulong* arr, ulong len);
void sortRbNode(rbNode* arr,idx_t len);
///ranges functions
/*
* considering @rangesN uniform ranges from [0,rangesEnd)
* return the rangeId that element @i is in
*/
inline ushort matchingUnifRangeIdxLinear(idx_t i,idx_t rangesEnd,ushort rangesN){
	double rangeW = rangesEnd / (double) rangesN;
	for(ushort j=0; j < rangesN; j++){
		idx_t rEnd = rangeW * (j+1);	//end (excluded) of the jth range
		if( i < rEnd )	return j;
	}
	assert( FALSE ); //i should be in a range!
	return EXIT_FAILURE;
}
/*
 * find which range @idx matches among
 * a uniform division of @size elements in @rangesN ranges with carried remainder
 * return the 0-based idx of the matched range
*/
inline ushort matchingUnifRangeIdx(idx_t idx, idx_t size, ushort rangesN){
idx_t rangeW = size/rangesN, rangeRem = size%rangesN;
idx_t searchStart,searchEnd;
//!IN_RANGE(idx,unifRemShareBlock(idx,rangeW,rangeRem)) && IN_RANGE(r,0,rangesN);){
for(ushort r=rangesN/2-1, hMoveWidth=rangesN/2; hMoveWidth>0;hMoveWidth/=2){
searchStart = unifRemShareStart(idx,rangeW,rangeRem);
searchEnd = unifRemShareEnd(idx,rangeW,rangeRem);
if (IN_RANGE(idx,searchStart,searchEnd)) return r;
else if (idx < searchStart) r = AVG(0,r);
else r = AVG(r,rangesN-1);
}
	assert(FALSE);
	return EXIT_FAILURE;
}
///reductionUtils
inline idx_t reductionSumSeq(idx_t* arr,idx_t arrLen){
idx_t i,out;
for(i=0,out=0; i<arrLen; out += arr[i++] );
return out;
}
inline idx_t reductionMaxSeq(idx_t* arr,idx_t arrLen){
	idx_t out = 0;
	for( idx_t i=0; i<arrLen; i++ )	out = out < arr[i] ? arr[i] : out;
	return out;
}
inline idx_t reductionSumOmp(idx_t* arr,idx_t arrLen){
	idx_t out = 0;	//must be initialized: the reduction adds into it
	#pragma omp parallel for reduction(+:out)
	for(idx_t i=0; i<arrLen; i++){
		out += arr[i];
	}
	return out;
}
inline idx_t reductionMaxOmp(idx_t* arr,idx_t arrLen){
	idx_t out = 0;	//must be initialized: the reduction combines into it
	#pragma omp parallel for reduction(max:out)
	for(idx_t i=0; i<arrLen; i++){ out = out < arr[i] ? arr[i] : out; }
	return out;
}
/*
 * return 0 if vectors a and b have elements that differ by at most DOUBLE_DIFF_THREASH
 * if diffMax!=NULL, save there the max difference value
 * between the 2 entries of the input vectors, signed as a[i] - b[i] (as well as dump prints)
 * CONVENTION: @a = true result, @b = vector to check against the true result
*/
int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax);
//fill a random vector in @v long @size doubles
int fillRndVector(ulong size, double* v);
//read vector as a sequence of space separated double from file at @fpath
#define VECTOR_STEP_MALLOC 100
/*
* decompress file at @path into @tmpFsDecompressPath,
 * the decompression command is obtained by first looking at the extension,
 * then matching it against a list of available decompression commands
 * that can be run as shell cmds, appending @path > @tmpFsDecompressPath
 * e.g. decompress with: xz -d -c @path > @tmpFsDecompressPath
 * Returns: -1 if decompression wasn't possible, otherwise the decompress command's exit status
*/
int extractInTmpFS(char* path, char* tmpFsDecompressPath);
//compute E[@values] in @out[0] and VAR[@values] in @out[1] of @numVals values
void statsAvgVar(double* values,uint numVals, double* out);
void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers);
void printVector(double* v,ulong size);
void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
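/*
 * Usage sketch for the reduction helpers above (illustrative array and sketch
 * name): the sequential and the OpenMP variants are expected to agree.
 */
static inline int reductionsAgreeSketch(void){
	idx_t a[] = {3, 1, 4, 1, 5};
	idx_t n = sizeof(a)/sizeof(*a);
	return reductionSumSeq(a, n) == reductionSumOmp(a, n)
	    && reductionMaxSeq(a, n) == reductionMaxOmp(a, n);	//expect 1
}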
#endif

@ -0,0 +1,414 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
(C) 2012 Michel Lespinasse <walken@google.com>
linux/lib/rbtree.c
*/
/*
* RedBlackTree_linux_userspace
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the RedBlackTree_linux_userspace or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE RedBlackTree_linux_userspace GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
///#include <linux/rbtree_augmented.h> //TODO LESS_DEPENDENCIES
#include "linuxK_rbtree_minimalized.h" ///fulfill embedded deps needed //TODO LESS_DEPENDENCIES
//#include <linux/export.h>
/* MINIMALIZED VERSION WITHOUT THE IMPORT OF rbtree_augmented
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
*
* 1) A node is either red or black
* 2) The root is black
* 3) All leaves (NULL) are black
* 4) Both children of every red node are black
* 5) Every simple path from root to leaves contains the same number
* of black nodes.
*
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
* consecutive red nodes in a path and every red node is therefore followed by
* a black. So if B is the number of black nodes on every simple path (as per
* 5), then the longest possible path due to 4 is 2B.
*
* We shall indicate color with case, where black nodes are uppercase and red
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
* parentheses and have some accompanying text comment.
*/
/*
* Notes on lockless lookups:
*
* All stores to the tree structure (rb_left and rb_right) must be done using
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
* tree structure as seen in program order.
*
* These two requirements will allow lockless iteration of the tree -- not
* correct iteration mind you, tree rotations are not atomic so a lookup might
* miss entire subtrees.
*
* But they do guarantee that any such traversal will only see valid elements
* and that it will indeed complete -- does not get stuck in a loop.
*
* It also guarantees that if the lookup returns an element it is the 'correct'
* one. But not returning an element does _NOT_ mean it's not present.
*
* NOTE:
*
* Stores to __rb_parent_color are not important for simple lookups so those
* are left undone as of now. Nor did I check for loops involving parent
* pointers.
*/
///rb_set_black //TODO ADDONLY_GENERIC
static inline struct rb_node *rb_red_parent(struct rb_node *red)
{
return (struct rb_node *)red->__rb_parent_color;
}
/*
* Helper function for rotations:
* - old's parent and color get assigned to new
* - old gets assigned new as a parent and 'color' as a color.
*/
static inline void
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
struct rb_root *root, int color)
{
struct rb_node *parent = rb_parent(old);
new->__rb_parent_color = old->__rb_parent_color;
rb_set_parent_color(old, new, color);
__rb_change_child(old, new, parent, root);
}
///TODO REMOVE AGUMENT
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root)
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
while (true) {
/*
* Loop invariant: node is red.
*/
if (unlikely(!parent)) {
/*
* The inserted node is root. Either this is the
* first node, or we recursed at Case 1 below and
* are no longer violating 4).
*/
rb_set_parent_color(node, NULL, RB_BLACK);
break;
}
/*
* If there is a black parent, we are done.
* Otherwise, take some corrective action as,
* per 4), we don't want a red root or two
* consecutive red nodes.
*/
if(rb_is_black(parent))
break;
gparent = rb_red_parent(parent);
tmp = gparent->rb_right;
if (parent != tmp) { /* parent == gparent->rb_left */
if (tmp && rb_is_red(tmp)) {
/*
* Case 1 - node's uncle is red (color flips).
*
* G g
* / \ / \
* p u --> P U
* / /
* n n
*
* However, since g's parent might be red, and
* 4) does not allow this, we need to recurse
* at g.
*/
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_right;
if (node == tmp) {
/*
* Case 2 - node's uncle is black and node is
* the parent's right child (left rotate at parent).
*
* G G
* / \ / \
* p U --> n U
* \ /
* n p
*
* This still leaves us in violation of 4), the
* continuation into Case 3 will fix that.
*/
tmp = node->rb_left;
WRITE_ONCE(parent->rb_right, tmp);
WRITE_ONCE(node->rb_left, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
///augment_rotate(parent, node); //TODO DEL AUGMENTED DEP
parent = node;
tmp = node->rb_right;
}
/*
* Case 3 - node's uncle is black and node is
* the parent's left child (right rotate at gparent).
*
* G P
* / \ / \
* p U --> n g
* / \
* n U
*/
WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
WRITE_ONCE(parent->rb_right, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
///augment_rotate(gparent, parent); //TODO DEL_AUGMENTED_DEP
break;
} else {
tmp = gparent->rb_left;
if (tmp && rb_is_red(tmp)) {
/* Case 1 - color flips */
rb_set_parent_color(tmp, gparent, RB_BLACK);
rb_set_parent_color(parent, gparent, RB_BLACK);
node = gparent;
parent = rb_parent(node);
rb_set_parent_color(node, parent, RB_RED);
continue;
}
tmp = parent->rb_left;
if (node == tmp) {
/* Case 2 - right rotate at parent */
tmp = node->rb_right;
WRITE_ONCE(parent->rb_left, tmp);
WRITE_ONCE(node->rb_right, parent);
if (tmp)
rb_set_parent_color(tmp, parent,
RB_BLACK);
rb_set_parent_color(parent, node, RB_RED);
///augment_rotate(parent, node); //TODO DEL_AUGMENTED_DEP
parent = node;
tmp = node->rb_left;
}
/* Case 3 - left rotate at gparent */
WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
WRITE_ONCE(parent->rb_left, gparent);
if (tmp)
rb_set_parent_color(tmp, gparent, RB_BLACK);
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
///augment_rotate(gparent, parent); //TODO DEL_AUGMENTED_DEP
break;
}
}
}
/*
* Inline version for rb_erase() use - we want to be able to inline
* and eliminate the dummy_rotate callback there
*/
///____rb_erase_color //TODO ADDONLY_GENERIC
/* Non-inline version for rb_erase_augmented() use */
///__rb_erase_color //TODO ADDONLY_GENERIC
/* TODO LESS_DEPENDENCIES MINIMALIZED
* Non-augmented rbtree manipulation functions.
*
* We use dummy augmented callbacks here, and have the compiler optimize them
* out of the rb_insert_color() and rb_erase() function definitions.
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
static const struct rb_augment_callbacks dummy_callbacks = {
.propagate = dummy_propagate,
.copy = dummy_copy,
.rotate = dummy_rotate
};*/
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
__rb_insert(node, root);
}
///rb_erase //TODO ADDONLY_GENERIC
/** TODO LESS_DEPENDENCIES - AUGMENTED REMOVE
* Augmented rbtree manipulation functions.
*
* This instantiates the same __always_inline functions as in the non-augmented
* case, but this time with user-defined callbacks.
*/
///__rb_insert_augmented todo LESS_DEPENDENCIES - AUGMENTED REMOVE
////// Traversing
/*
* This function returns the first node (in sort order) of the tree.
*/
struct rb_node *rb_first(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_left)
n = n->rb_left;
return n;
}
struct rb_node *rb_last(const struct rb_root *root)
{
struct rb_node *n;
n = root->rb_node;
if (!n)
return NULL;
while (n->rb_right)
n = n->rb_right;
return n;
}
struct rb_node *rb_next(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a right-hand child, go down and then left as far
* as we can.
*/
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
node = node->rb_left;
return (struct rb_node *)node;
}
/*
* No right-hand children. Everything down and left is smaller than us,
* so any 'next' node must be in the general direction of our parent.
* Go up the tree; any time the ancestor is a right-hand child of its
* parent, keep going up. First time it's a left-hand child of its
* parent, said parent is our 'next' node.
*/
while ((parent = rb_parent(node)) && node == parent->rb_right)
node = parent;
return parent;
}
struct rb_node *rb_prev(const struct rb_node *node)
{
struct rb_node *parent;
if (RB_EMPTY_NODE(node))
return NULL;
/*
* If we have a left-hand child, go down and then right as far
* as we can.
*/
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
node = node->rb_right;
return (struct rb_node *)node;
}
/*
* No left-hand children. Go up till we find an ancestor which
* is a right-hand child of its parent.
*/
while ((parent = rb_parent(node)) && node == parent->rb_left)
node = parent;
return parent;
}
///rb_replace_node //TODO ADDONLY_GENERIC
///rb_replace_node_rcu //TODO LESS_DEPENDENCIES
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
for (;;) {
if (node->rb_left)
node = node->rb_left;
else if (node->rb_right)
node = node->rb_right;
else
return (struct rb_node *)node;
}
}
struct rb_node *rb_next_postorder(const struct rb_node *node)
{
const struct rb_node *parent;
if (!node)
return NULL;
parent = rb_parent(node);
/* If we're sitting on node, we've already seen our children */
if (parent && node == parent->rb_left && parent->rb_right) {
/* If we are the parent's left node, go to the parent's right
* node then all the way down to the left */
return rb_left_deepest_node(parent->rb_right);
} else
/* Otherwise we are the parent's right node, and the parent
* should be next */
return (struct rb_node *)parent;
}
struct rb_node *rb_first_postorder(const struct rb_root *root)
{
if (!root->rb_node)
return NULL;
return rb_left_deepest_node(root->rb_node);
}
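/*
 * Traversal sketch: an illustrative payload struct embedding struct rb_node,
 * inserted with a hand-written descent plus rb_insert_color(), then walked in
 * key order with rb_first()/rb_next(). rb_link_node() and container_of() are
 * assumed to be provided by linuxK_rbtree_minimalized.h and its macro mock.
 */
struct intNodeSketch { int key; struct rb_node rb; };
static void intNodeInsertSketch(struct rb_root* root, struct intNodeSketch* nw)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	while (*link) {
		parent = *link;
		struct intNodeSketch* cur = container_of(parent, struct intNodeSketch, rb);
		link = nw->key < cur->key ? &parent->rb_left : &parent->rb_right;
	}
	rb_link_node(&nw->rb, parent, link);	/* attach as a red leaf */
	rb_insert_color(&nw->rb, root);		/* rebalance */
}
static int intNodeSumInorderSketch(struct rb_root* root)
{
	int sum = 0;
	for (struct rb_node* n = rb_first(root); n; n = rb_next(n))
		sum += container_of(n, struct intNodeSketch, rb)->key;
	return sum;
}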

@ -0,0 +1,511 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "mmio.h"
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_)
{
FILE *f;
MM_typecode matcode;
int M, N, nz;
int i;
double *val;
int *I, *J;
if ((f = fopen(fname, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
printf(" in file [%s]\n", fname);
return -1;
}
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
mm_is_sparse(matcode)))
{
fprintf(stderr, "Sorry, this application does not support ");
fprintf(stderr, "Market Market type: [%s]\n",
mm_typecode_to_str(matcode));
return -1;
}
/* find out size of sparse matrix: M, N, nz .... */
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
{
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
return -1;
}
*M_ = M;
*N_ = N;
*nz_ = nz;
/* reserve memory for matrices */
I = (int *) malloc(nz * sizeof(int));
J = (int *) malloc(nz * sizeof(int));
val = (double *) malloc(nz * sizeof(double));
*val_ = val;
*I_ = I;
*J_ = J;
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (i=0; i<nz; i++)
{
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
I[i]--; /* adjust from 1-based to 0-based */
J[i]--;
}
fclose(f);
return 0;
}
int mm_is_valid(MM_typecode matcode)
{
if (!mm_is_matrix(matcode)) return 0;
if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
mm_is_skew(matcode))) return 0;
return 1;
}
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
char line[MM_MAX_LINE_LENGTH];
char banner[MM_MAX_TOKEN_LENGTH];
char mtx[MM_MAX_TOKEN_LENGTH];
char crd[MM_MAX_TOKEN_LENGTH];
char data_type[MM_MAX_TOKEN_LENGTH];
char storage_scheme[MM_MAX_TOKEN_LENGTH];
char *p;
mm_clear_typecode(matcode);
if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
return MM_PREMATURE_EOF;
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
storage_scheme) != 5)
return MM_PREMATURE_EOF;
for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */
for (p=crd; *p!='\0'; *p=tolower(*p),p++);
for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
/* check for banner */
if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
return MM_NO_HEADER;
/* first field should be "mtx" */
if (strcmp(mtx, MM_MTX_STR) != 0)
return MM_UNSUPPORTED_TYPE;
mm_set_matrix(matcode);
/* second field describes whether this is a sparse matrix (in coordinate
storage) or a dense array */
if (strcmp(crd, MM_SPARSE_STR) == 0)
mm_set_sparse(matcode);
else
if (strcmp(crd, MM_DENSE_STR) == 0)
mm_set_dense(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* third field */
if (strcmp(data_type, MM_REAL_STR) == 0)
mm_set_real(matcode);
else
if (strcmp(data_type, MM_COMPLEX_STR) == 0)
mm_set_complex(matcode);
else
if (strcmp(data_type, MM_PATTERN_STR) == 0)
mm_set_pattern(matcode);
else
if (strcmp(data_type, MM_INT_STR) == 0)
mm_set_integer(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* fourth field */
if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
mm_set_general(matcode);
else
if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
mm_set_symmetric(matcode);
else
if (strcmp(storage_scheme, MM_HERM_STR) == 0)
mm_set_hermitian(matcode);
else
if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
mm_set_skew(matcode);
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
{
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = *nz = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
return 0;
else
do
{
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 3);
return 0;
}
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
    /* line[] is either blank or has M,N */
if (sscanf(line, "%d %d", M, N) == 2)
return 0;
else /* we have a blank line */
do
{
num_items_read = fscanf(f, "%d %d", M, N);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 2);
return 0;
}
int mm_write_mtx_array_size(FILE *f, int M, int N)
{
if (fprintf(f, "%d %d\n", M, N) != 2)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
/*-------------------------------------------------------------------------*/
/******************************************************************/
/* use when I[], J[], and val[] are already allocated                 */
/******************************************************************/
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
int i;
if (mm_is_complex(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
for (i=0; i<nz; i++)
{
if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
!= 3) return MM_PREMATURE_EOF;
}
}
else if (mm_is_pattern(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d", &I[i], &J[i])
!= 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
double *real, double *imag, MM_typecode matcode)
{
if (mm_is_complex(matcode))
{
if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
if (fscanf(f, "%d %d %lg\n", I, J, real)
!= 3) return MM_PREMATURE_EOF;
}
else if (mm_is_pattern(matcode))
{
if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
/************************************************************************
    mm_read_mtx_crd()  fills M, N, nz, the array of values, and returns the
                        type code, e.g. 'MCRS'
if matrix is complex, values[] is of size 2*nz,
(nz pairs of real/imaginary values)
************************************************************************/
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
double **val, MM_typecode *matcode)
{
int ret_code;
FILE *f;
if (strcmp(fname, "stdin") == 0) f=stdin;
else
if ((f = fopen(fname, "r")) == NULL)
return MM_COULD_NOT_READ_FILE;
if ((ret_code = mm_read_banner(f, matcode)) != 0)
return ret_code;
if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
mm_is_matrix(*matcode)))
return MM_UNSUPPORTED_TYPE;
if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
return ret_code;
*I = (int *) malloc(*nz * sizeof(int));
*J = (int *) malloc(*nz * sizeof(int));
*val = NULL;
if (mm_is_complex(*matcode))
{
*val = (double *) malloc(*nz * 2 * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_real(*matcode))
{
*val = (double *) malloc(*nz * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_pattern(*matcode))
{
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
if (f != stdin) fclose(f);
return 0;
}
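/*
 * Illustrative sketch (assumption, not original mmio code): typical use of
 * mm_read_mtx_crd(). The path is a placeholder. For complex matrices val
 * holds 2*nz doubles; for pattern matrices val stays NULL.
 */
#if 0
static int example_read_crd(const char *path)
{
    int M, N, nz, *I, *J;
    double *val;
    MM_typecode code;
    char *ts;
    if (mm_read_mtx_crd((char *)path, &M, &N, &nz, &I, &J, &val, &code))
        return -1;
    ts = mm_typecode_to_str(code);       /* strdup'd string: free after use */
    printf("read [%s]: %d x %d, %d entries\n", ts, M, N, nz);
    free(ts);
    free(I); free(J); free(val);         /* free(NULL) is a no-op for pattern matrices */
    return 0;
}
#endif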
int mm_write_banner(FILE *f, MM_typecode matcode)
{
char *str = mm_typecode_to_str(matcode);
int ret_code;
ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
free(str);
    /* fprintf() returns characters written; a negative value means a write error */
    if (ret_code < 0)
        return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
FILE *f;
int i;
if (strcmp(fname, "stdout") == 0)
f = stdout;
else
if ((f = fopen(fname, "w")) == NULL)
return MM_COULD_NOT_WRITE_FILE;
    /* print banner followed by typecode */
    fprintf(f, "%s ", MatrixMarketBanner);
    {
        char *ts = mm_typecode_to_str(matcode);  /* strdup'd string: free after printing */
        fprintf(f, "%s\n", ts);
        free(ts);
    }
/* print matrix sizes and nonzeros */
fprintf(f, "%d %d %d\n", M, N, nz);
/* print values */
if (mm_is_pattern(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d\n", I[i], J[i]);
else
if (mm_is_real(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
else
if (mm_is_complex(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
val[2*i+1]);
else
{
if (f != stdout) fclose(f);
return MM_UNSUPPORTED_TYPE;
}
if (f !=stdout) fclose(f);
return 0;
}
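/*
 * Illustrative sketch (assumption): writing a tiny 2x2 real coordinate matrix
 * with mm_write_mtx_crd(). Only typecode macros already used in this file are
 * relied on; "out.mtx" is a placeholder path and indices are 1-based.
 */
#if 0
static int example_write_crd(void)
{
    MM_typecode code;
    int I[2] = {1, 2}, J[2] = {1, 2};
    double val[2] = {3.0, 4.0};
    mm_clear_typecode(&code);
    mm_set_matrix(&code);
    mm_set_sparse(&code);
    mm_set_real(&code);
    mm_set_general(&code);
    return mm_write_mtx_crd((char *)"out.mtx", 2, 2, 2, I, J, val, code);
}
#endif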
/**
* Create a new copy of a string s. mm_strdup() is a common routine, but
* not part of ANSI C, so it is included here. Used by mm_typecode_to_str().
*
*/
char *mm_strdup(const char *s)
{
int len = strlen(s);
char *s2 = (char *) malloc((len+1)*sizeof(char));
return strcpy(s2, s);
}
char *mm_typecode_to_str(MM_typecode matcode)
{
    char buffer[MM_MAX_LINE_LENGTH];
    char *types[4];
    /* check for MTX type */
    if (mm_is_matrix(matcode))
        types[0] = MM_MTX_STR;
    else
        return NULL;
/* check for CRD or ARR matrix */
if (mm_is_sparse(matcode))
types[1] = MM_SPARSE_STR;
else
if (mm_is_dense(matcode))
types[1] = MM_DENSE_STR;
else
return NULL;
/* check for element data type */
if (mm_is_real(matcode))
types[2] = MM_REAL_STR;
else
if (mm_is_complex(matcode))
types[2] = MM_COMPLEX_STR;
else
if (mm_is_pattern(matcode))
types[2] = MM_PATTERN_STR;
else
if (mm_is_integer(matcode))
types[2] = MM_INT_STR;
else
return NULL;
/* check for symmetry type */
if (mm_is_general(matcode))
types[3] = MM_GENERAL_STR;
else
if (mm_is_symmetric(matcode))
types[3] = MM_SYMM_STR;
else
if (mm_is_hermitian(matcode))
types[3] = MM_HERM_STR;
else
if (mm_is_skew(matcode))
types[3] = MM_SKEW_STR;
else
return NULL;
sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
return mm_strdup(buffer);
}

@ -0,0 +1,391 @@
/*
* Sp3MM_for_AlgebraicMultiGrid
* (C) Copyright 2021-2022
* Andrea Di Iorio
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "sparseMatrix.h"
#include "mmio.h"
#include "parser.h"
#include "macros.h"
#include "utils.h"
////COO PARSE
int MMCheck(MM_typecode mcode) {
if (!mm_is_matrix(mcode)){ //consistency checks among flags in @mcode
ERRPRINT("invalid matrix: not a matrix\n");
return EXIT_FAILURE;
}
if (mm_is_dense(mcode) ){ //|| mm_is_array(mcode) ){
ERRPRINT("invalid matrix: not a supported sparse matrix\tDENSE MAT\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens){
int scanndRet=0;
ulong nzTrgt=*NZ,nzIdx=0; //expanded num of nz (in case of sym matrix)
ulong diagEntries=0, row = 0, col = 0;//current entry's row,col from MM -> 1 based
double val = 0;
entry* entries = NULL; //COO parsed entries
///init
if (mm_is_symmetric(mcode)){
nzTrgt = 2* (*NZ); //upscale max num of nz in the matrix
VERBOSE printf("MMtoCOO:\tparsing a simmetric matrix\n");
}
if (!(entries = malloc(nzTrgt * sizeof(*entries)))){
ERRPRINT("MMtoCOO: entries malloc errd\n");
return NULL;
}
///parse MM fp lines into COOordinate entries
while (1) { // Reading the fp until EOF
if (mm_is_pattern(mcode)){
scanndRet = fscanf(fp, "%lu %lu\n", &row, &col);
val = 1.0;
} else if (mm_is_real(mcode) || (mm_is_integer(mcode))){
scanndRet = fscanf(fp, "%lu %lu %lf\n", &row, &col, &val);
}
if (scanndRet == EOF){ //TODO more strict check with type&ret?
if (ferror(fp)){
perror("fscanf EOF");
goto _err;
} else break;
}
CONSISTENCY_CHECKS{
//TODO USELESS ? ? ?
if ((mm_is_pattern(mcode) && scanndRet != 2) ||
(!mm_is_pattern(mcode) && scanndRet != 3)){
ERRPRINT("invalid matrix: not consistent entry scannable\n");
goto _err;
}
}
////ADD THE CURRENT MATRIX ENTRY
rowLens[row-1]++;
entries[nzIdx++]=(entry) { .row=row-1, .col=col-1, .val=val };
//also mirrored entry if sym.matrix with reflected idx inside matrix limits
if (mm_is_symmetric(mcode) && row != col ){
//TODO COSTRAINED FORMAT ?&& row <= mat->N && col <= mat->M ){
SWAP(row,col);
rowLens[row-1]++;
entries[nzIdx++]=(entry) { .row=row-1, .col=col-1, .val=val };
}
else diagEntries++; //for CONSISTENCY_CHECKS only
}
//CONSISTENCY_CHECKS
nzTrgt = *NZ;
if(mm_is_symmetric(mcode)) nzTrgt = 2*(*NZ) - diagEntries;
assert( nzIdx == nzTrgt );
//update NZ
*NZ = nzIdx;
return entries;
_err:
free(entries); return NULL;
}
void freeMatrixMarket(MatrixMarket* mm){
if (!mm) return;
free(mm->entries);
free(mm->rowLens);
free(mm);
}
MatrixMarket* MMRead(char* matPath){
FILE* fp = fopen(matPath, "r");
if (!fp){
perror("fopen");
return NULL;
}
MatrixMarket* out = calloc(1,sizeof(*out));
if (!out){
ERRPRINT("MMRead out malloc errd\n");
goto err;
}
//banner -> parse matrix specs
if (mm_read_banner(fp, &out->mcode) != 0) {
fprintf(stderr,"mm_read_banner err at:%s\n",matPath);
goto err;
}
//assert matrix is compatible with this app scope
if (MMCheck(out->mcode)) goto err;
//parse sizes
//TODO OVERCOME uint limitation?
if(mm_read_mtx_crd_size(fp,(uint*) &out->M, (uint*) &out->N, (uint*) &out->NZ)){
fprintf(stderr,"mm_read_mtx_crd_size err at %s:\n",matPath);
goto err;
}
if (!(out->rowLens = calloc(out->M,sizeof(*(out->rowLens))))){
ERRPRINT("MMRead:\trowLens calloc errd\n");
goto err;
}
if (!(out->entries = MMtoCOO(&out->NZ, fp, out->mcode,out->rowLens))){
ERRPRINTS("MAT PARSE TO CSR ERR at:%s\n",matPath);
goto err;
}
goto _end;
err:
freeMatrixMarket(out);
out = NULL;
_end:
fclose(fp);
return out;
}
////COO -> ANYTHING ELSE CONVERSION
int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
int out = EXIT_FAILURE;
ulong idx;
long* _rowsLastCol = NULL; //for each row -> last added entry's columnIdx
ulong* rowsNextIdx = NULL; //for each row -> next entry progressive idx
if (!(rowsNextIdx = calloc(mat->M,sizeof(*rowsNextIdx)))){
ERRPRINT("MMtoCOO: rowsNextIdx calloc errd\n");
goto _end;
}
CONSISTENCY_CHECKS{ //alloc and init aux arr for entries sort check
if (!(_rowsLastCol = malloc(mat->M*sizeof(*_rowsLastCol)))){
ERRPRINT("MMtoCOO: _rowsLastCol malloc errd\n");
goto _end;
}
memset(_rowsLastCol,-1,mat->M*sizeof(*_rowsLastCol));
}
/*TODO OLD
* //get rowLens->IRP (partial), TODO moved MMtoCOO to avoid FULL rescan entries
* for (ulong i=0; i<mat->NZ; i++) mat->IRP[entries[i].row+1]++;
* memcpy(mat->RL,mat->IRP + 1,sizeof(*mat->IRP) * mat->M); //TODO in next ifdef
* for (ulong i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
* OLD2: rowLens memcpy ... no just moved the pointer
* #ifdef ROWLENS
* memcpy(mat->RL,rowLens,sizeof(*rowLens) * mat->M); //TODO in next ifdef
* #endif
*/
	//IRP: transform the row lengths into increments to build the row index "pointer"
	//IRP[0] = 0 mandatory; IRP[1] = len of row 0, ..., IRP[M] = end of last row (== NZ)
memcpy(mat->IRP+1,rowLens,sizeof(*rowLens) * mat->M);//init IRP with rows lens
for (ulong i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
CONSISTENCY_CHECKS assert(mat->IRP[mat->M] == mat->NZ);
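	/* Worked example of the IRP construction above (illustration only):
	 * rowLens = {2,0,3}  ->  IRP after memcpy     = {0,2,0,3}
	 *                    ->  IRP after prefix sum = {0,2,2,5} (last entry == NZ)
	 * so row r owns the index range [IRP[r], IRP[r+1]) in AS/JA. */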
///FILL
	//TODO EXPECTED: entries with .col sorted within each row -> CONSISTENCY_CHECKS
	//(rows need not appear contiguously in @entries)
	//write the entries in CSR format
entry* e;
for (ulong i=0; i<mat->NZ; i++) {
e = entries+i;
CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE SORTED
/*#pragma message("COO sorting check enabled")*/
if (_rowsLastCol[e->row] >= (long) e->col){
ERRPRINTS("not sorted entry:%ld,%ld,%lf",e->row,e->col,e->val);
goto _end;
}
_rowsLastCol[e->row] = e->col;
}
idx = mat -> IRP[e->row] + rowsNextIdx[e->row]++;
mat -> AS[idx] = e->val;
mat -> JA[idx] = e->col;
}
out = EXIT_SUCCESS;
_end:
if(rowsNextIdx) free(rowsNextIdx);
if(_rowsLastCol) free(_rowsLastCol);
return out;
}
int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
int out=EXIT_FAILURE;
ulong maxRow = 0, col, _ellEntriesTot, *rowsNextCol;
long* _rowsLastCol=NULL;
entry* e;
for (ulong i=0; i<mat->M; i++) maxRow = MAX(maxRow,rowLens[i]);
_ellEntriesTot = 2*mat->M*maxRow;
#ifdef LIMIT_ELL_SIZE
if ( _ellEntriesTot > ELL_MAX_ENTRIES ){
ERRPRINTS("Required entries %lu -> %lu uMB for the matrix exceed the "
"designated threashold of: %lu -> %lu MB for ellpack\n",
_ellEntriesTot,(sizeof(double)*_ellEntriesTot) >> 20,
ELL_MAX_ENTRIES,(sizeof(double)*ELL_MAX_ENTRIES) >> 20);
return EXIT_FAILURE;
}
#endif
//malloc aux vects
if (!(rowsNextCol = calloc(mat->M,sizeof(*rowsNextCol)))){
ERRPRINT("MMtoELL:\trowsNextCol calloc errd\n");
goto _end;
}
CONSISTENCY_CHECKS{ //alloc and init aux arr for entries SORT CHECK
if (!(_rowsLastCol = malloc(mat->M*sizeof(*_rowsLastCol)))){
ERRPRINT("MMtoELL:\trowsLastCol malloc errd\n");
goto _end;
}
memset(_rowsLastCol,-1,mat->M*sizeof(*_rowsLastCol));
}
///malloc dependant to MAX ROW LEN, err free in the caller
if (!(mat->AS = calloc(mat->M * maxRow, sizeof(*(mat->AS))))){
ERRPRINT("MMtoELL:\tELL->AS calloc errd\n");
goto _end;
} //zero init for auto rows residual fill with 0
if (!(mat->JA = calloc(mat->M * maxRow, sizeof(*(mat->JA))))){
ERRPRINT("MMtoELL:\tELL->JA calloc errd\n");
goto _end;
}
mat->MAX_ROW_NZ = maxRow;
/*#ifdef ROWLENS
*memcpy(mat->RL,rowLens,sizeof(*rowLens) * mat->M); //TODO in next ifdef
*#endif
*/
///FILL NZ
	//TODO EXPECTED: entries with .col sorted within each row -> CONSISTENCY_CHECKS
	//(rows need not appear contiguously in @entries)
for (ulong i=0; i<mat->NZ; i++){
e = entries + i;
		CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE COLUMN-SORTED within each row
/*#pragma message("COO sorting check enabled")*/
if (_rowsLastCol[e->row] >= (long) e->col){
ERRPRINTS("not sorted entry:%ld,%ld,%lf",
e->row,e->col,e->val);
goto _end;
}
_rowsLastCol[e->row] = e->col;
}
col = rowsNextCol[e->row]++; //place entry in its row's sequent spot
mat->AS[ IDX2D(e->row,col,maxRow) ] = e->val;
mat->JA[ IDX2D(e->row,col,maxRow) ] = e->col;
}
///FILL PAD
ulong padded = 0,paddedEntries = mat->M*mat->MAX_ROW_NZ;
for (ulong r=0; r<mat->M; r++){
for (ulong c=rowLens[r],j=IDX2D(r,c,maxRow); c<maxRow; c++,j++,padded++){
//mat->AS[j] = ELL_AS_FILLER; //TODO ALREADY DONE IN CALLOC
//mat->JA[j] = mat->JA[rowLens[r]-1]; //ELL_JA_FILLER; //TODO calloc CUDA benefit?
}
}
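	/* Illustration (not part of the original code): with maxRow = 3, a row of
	 * length 2 holding {(.col=4,.val=7),(.col=9,.val=1)} is stored as
	 *   AS row slice = {7, 1, 0}   JA row slice = {4, 9, 0}
	 * at AS[IDX2D(r,c,maxRow)] / JA[IDX2D(r,c,maxRow)]; the trailing slots keep
	 * the calloc zeros, which the loop above only counts as padding. */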
VERBOSE{
printf("padded %lu entries = %lf%% of NZ\n",padded,100*padded/(double) mat->NZ);
printf("ELL matrix of: %lu paddedEntries -> %lu MB of JA+AS\n",
paddedEntries,(paddedEntries*sizeof(*(mat->JA))+paddedEntries*sizeof(*(mat->AS))) >> 20);
}
out = EXIT_SUCCESS;
_end:
if(rowsNextCol) free(rowsNextCol);
if(_rowsLastCol) free(_rowsLastCol);
return out;
}
////wrapper MM -> specialized target
spmat* MMtoCSR(char* matPath){
spmat* mat = NULL;
MatrixMarket* mm = MMRead(matPath);
if (!mm){
ERRPRINT("MMtoCSR parse err\n");
return NULL;
}
if (!(mat = calloc(1,sizeof(*mat)))){
ERRPRINT("MMtoCSR: mat struct alloc errd");
goto err;
}
mat -> M = mm->M;
mat -> N = mm->N;
mat -> NZ= mm->NZ;
//alloc sparse matrix components
if (!(mat->IRP = calloc(mat->M+1,sizeof(*(mat->IRP))))){
ERRPRINT("MMtoCSR: IRP calloc err\n");
goto err;
}
////alloc core struct of CSR
if(!(mat->JA = malloc(mat->NZ*sizeof(*(mat->JA))))){
ERRPRINT("MMtoCSR: JA malloc err\n");
goto err;
}
if(!(mat->AS = malloc(mat->NZ*sizeof(*(mat->AS))))){
ERRPRINT("MMtoCSR: AS malloc err\n");
goto err;
}
if (COOtoCSR(mm->entries,mat,mm->rowLens)) goto err;
#ifdef ROWLENS
mat->RL = mm->rowLens;
mm->rowLens = NULL; //avoid free in @freeMatrixMarket
#endif
VERBOSE
printf("MMtoCSR: %lu NZ entries-> %lu MB of AS+JA+IRP\n",mat->NZ,
(mat->NZ*sizeof(*(mat->AS))+mat->NZ*sizeof(*(mat->JA))+(1+mat->M*sizeof(*(mat->IRP))))>>20);
goto _free;
err:
if (mat) freeSpmat(mat);
mat = NULL;
_free:
freeMatrixMarket(mm);
return mat;
}
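/*
 * Illustrative sketch (assumption, not original parser code): loading a CSR
 * matrix through the MMtoCSR() wrapper above. The path is a placeholder;
 * freeSpmat() is the destructor already used in this file.
 */
#if 0
static int example_load_csr(const char *path)
{
	spmat *a = MMtoCSR((char *)path);
	if (!a)
		return EXIT_FAILURE;
	printf("CSR: %lu x %lu, %lu nonzeros, IRP[M]=%lu\n", a->M, a->N, a->NZ, a->IRP[a->M]);
	freeSpmat(a);
	return EXIT_SUCCESS;
}
#endif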
spmat* MMtoELL(char* matPath){
spmat* mat = NULL;
MatrixMarket* mm = MMRead(matPath);
if (!mm){
ERRPRINT("MMtoELL: parse err\n");
return NULL;
}
if (!(mat = calloc(1,sizeof(*mat)))){
ERRPRINT("MMtoELL: mat struct alloc errd");
goto err;
}
	////alloc core struct of ELL
mat -> M = mm->M;
mat -> N = mm->N;
mat -> NZ= mm->NZ;
if (COOtoELL(mm->entries,mat,mm->rowLens)) goto err;
#ifdef ROWLENS
mat->RL = mm->rowLens;
mm->rowLens = NULL; //avoid free in @freeMatrixMarket
#endif
goto _free;
err:
if(mat) freeSpmat(mat);
mat = NULL;
_free:
freeMatrixMarket(mm);
return mat;
}

@ -40,35 +40,4 @@ module psb_objhandle_mod
type, bind(c) :: psb_c_zspmat
type(c_ptr) :: item = c_null_ptr
end type psb_c_zspmat
  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  ! sp3mm c code structs
  ! TODO : rename to conventions
  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  ! note: BIND(C) derived types take no NAME= specifier and cannot hold
  ! allocatable components, so the C arrays are mirrored as c_ptr members
  type, bind(c) :: spmat_t
    ! number of non zeros and dimensions
    integer(c_size_t) :: nz, m, n
    ! value array (double* on the C side)
    type(c_ptr) :: as = c_null_ptr
    ! columns array (ulong* on the C side)
    type(c_ptr) :: ja = c_null_ptr
    ! row index pointers array (ulong* on the C side)
    type(c_ptr) :: irp = c_null_ptr
    ! lengths of the rows (ulong* on the C side)
    type(c_ptr) :: rl = c_null_ptr
    ! max value of rl
    integer(c_size_t) :: max_row_nz
  end type spmat_t
  type, bind(c) :: config_t
    ! dimensions of the grid
    integer(c_short) :: grid_rows, grid_cols
    ! how to compute symb mul (if required)
    integer(c_int) :: symb_mm_row_impl_id
    ! thread num to use in OMP parallel region
    integer(c_int) :: thread_num
    ! CHUNKS_DISTR_INTERF func pntr
    type(c_ptr) :: chunk_distrb_func
  end type config_t
end module psb_objhandle_mod