From 49299912497b2781550786e2be318f5c36833a2f Mon Sep 17 00:00:00 2001
From: wlthr <dwalther@enseirb-matmeca.fr>
Date: Tue, 18 Jul 2023 10:44:04 +0200
Subject: [PATCH] changed all unsigned longs to int, modified makefile for
 debug purposes, reallocate c matrix arrays in sp3mm_impl.f90

---
 base/serial/impl/sp3mm4amg/Makefile           |   2 +-
 .../sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c     |  18 +--
 .../Sp3MM_CSR_OMP_SymbStep_Generic.c          |  21 ++--
 .../impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c |  78 ++++++------
 .../sp3mm4amg/commons/sparseUtilsGeneric.c    | 112 +++++++++---------
 base/serial/impl/sp3mm4amg/commons/utils.c    |  74 ++++++------
 .../impl/sp3mm4amg/fbind/psb_f_spmm_ub.c      |  74 +++++++++---
 .../sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h   |   6 +-
 .../include/Sp3MM_CSR_OMP_UB_Generic.h        |   2 +-
 .../impl/sp3mm4amg/include/SpMMUtilsGeneric.h |  68 +++++------
 base/serial/impl/sp3mm4amg/include/config.h   |   6 +-
 .../impl/sp3mm4amg/include/inlineExports.h    |  14 +--
 .../sp3mm4amg/include/inlineExports_Generic.h |   8 +-
 base/serial/impl/sp3mm4amg/include/macros.h   |   2 +-
 .../impl/sp3mm4amg/include/ompChunksDivide.h  |  10 +-
 base/serial/impl/sp3mm4amg/include/parser.h   |  14 +--
 .../impl/sp3mm4amg/include/sparseMatrix.h     |   2 +-
 .../sp3mm4amg/include/sparseUtilsGeneric.h    |  24 ++--
 base/serial/impl/sp3mm4amg/include/utils.h    |  26 ++--
 base/serial/impl/sp3mm4amg/lib/parser.c       |  52 ++++----
 base/serial/impl/sp3mm_impl.f90               |  52 ++++----
 21 files changed, 351 insertions(+), 314 deletions(-)
diff --git a/base/serial/impl/sp3mm4amg/Makefile b/base/serial/impl/sp3mm4amg/Makefile
index 9ceb1a75..d92af835 100644
--- a/base/serial/impl/sp3mm4amg/Makefile
+++ b/base/serial/impl/sp3mm4amg/Makefile
@@ -10,7 +10,7 @@ CWALL+=-Wno-unused-label -Wfatal-errors
 CINCL   = -Iinclude/ 
 CFLAGS  = -g -O3 $(CWALL) $(CINCL) -fopenmp $(RUNTIME)
 MACROS = -DDEBUGPRINT="if(FALSE)" -DDEBUG="if(FALSE)" -DCONSISTENCY_CHECKS="if(FALSE)" -DVERBOSE="if(FALSE)"  -DDEBUGCHECKS="if(FALSE)" 
-MACROSDBG = -DCONSISTENCY_CHECKS="if(TRUE)"  -DDEBUGCHECKS="if(TRUE)" -DVERBOSE="if(TRUE)" -DDEBUG="if(TRUE)"
+MACROSDBG = -DCONSISTENCY_CHECKS="if(TRUE)"  -DDEBUGCHECKS="if(TRUE)" -DVERBOSE="if(TRUE)" -DDEBUG="if(TRUE)" -DDEBUGPRINT="if(TRUE)"
 LDFLAGS = -lm
 
 LIBDIR=../../../
diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c
index fc4306c1..5867de7d 100644
--- a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c
+++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c
@@ -122,7 +122,7 @@ spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 			bRowLen = B->IRP[ca+1] 	- B->IRP[ca];
 			CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
 		} */
-		for (ulong ja=A->IRP[r]-OFF_F;	ja<A->IRP[r+1]-OFF_F;	ja++) //row-by-row formul
+		for (idx_t ja=A->IRP[r]-OFF_F;	ja<A->IRP[r+1]-OFF_F;	ja++) //row-by-row formul
 			CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
 		//direct sparsify: trasform accumulated dense vector to a CSR row
 		sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
@@ -165,10 +165,10 @@ spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	
 	///NUMERIC STEP
 	//perform Gustavson over rows blocks -> M / @cfg->gridRows
-	ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
+	idx_t rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
 	((CHUNKS_DISTR_INTERF)	cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
-	ulong b,startRow,block; //omp for aux vars
+	idx_t b,startRow,block; //omp for aux vars
 	#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
 	for (b=0;   b < cfg->gridRows; b++){
 		block	  = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
@@ -184,7 +184,7 @@ spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 				bRowLen = B->IRP[ca+1] 	- B->IRP[ca];
 				CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
 			} */
-			for (ulong ja=A->IRP[r]-OFF_F;	ja<A->IRP[r+1]-OFF_F;	ja++) //row-by-row formul
+			for (idx_t ja=A->IRP[r]-OFF_F;	ja<A->IRP[r+1]-OFF_F;	ja++) //row-by-row formul
 				CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
 			//direct sparsify: trasform accumulated dense vector to a CSR row
 			sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
@@ -209,7 +209,7 @@ spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 
 }
 spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ 
-	DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
+	DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%dxN=%d\n",A->M,B->N);
 	idx_t* bColOffsets = NULL;   //B group columns starting offset for each row
 	ACC_DENSE *accVectors=NULL,*accV;
 	SPACC* accRowPart;
@@ -218,10 +218,10 @@ spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	idx_t	*rowsPartsSizes=NULL, *rowSizes=NULL;	//for rows' cols partition, correct len
 	if (!AB)	goto _err;
 	//2D indexing aux vars
-	ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
-	ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
-	ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
-	ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
+	idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
+	idx_t _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
+	idx_t _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
+	idx_t startRow,startCol,rowBlock,colBlock; //data division aux variables
 	////get bColOffsets for B column groups 
 	if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
 		goto _err;
diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c
index c41242a5..4ae12197 100644
--- a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c
+++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c
@@ -114,7 +114,7 @@ static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
  	 * sorting inplace the nodes inserted in the rbtree */
 	sortRbNode(nodes,abRowLen);
 	#elif	_OUT_IDXS == T || _COL_PARTS == T
-	uint i=0;
+	int i=0;
 	idx_t k;
 	#if _COL_PARTS == T
 	//colParts aux vars
@@ -144,7 +144,7 @@ static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
 		idx_t k;
   		for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
 			k = rb_entry(n,rbNode,rb)->key;
-			printf("%lu, ",k);
+			printf("%d, ",k);
 		}
 		printf("\n");
 	}*/
@@ -327,7 +327,8 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
 		goto _err;
 	}
 	#endif //_COL_PARTS
-	uint maxThreads	= omp_get_max_threads();	//TODO FROM CFG
+	int maxThreads;
+	maxThreads = omp_get_max_threads();	//TODO FROM CFG
 	//index keeping aux struct
 	//rbtree implementation or idxMap with aux of symbTree for tmp outIdx keeping
 	if ( rbTreeUsed ){
@@ -340,7 +341,7 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
 			goto _err;
 		}
 		//init roots
-		for (uint i=0; i<maxThreads; i++)
+		for (int i=0; i<maxThreads; i++)
 			rbRoots[i] = RB_ROOT_CACHED;
 	} 
 	if (symbRowImplID == IDXMAP){  //idxMap implementation && not outIdxs via rbtree
@@ -350,7 +351,7 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
 			goto _err;
 		}
 		//init idxs maps
-		for (uint i=0; i<maxThreads; i++){
+		for (int i=0; i<maxThreads; i++){
 			if(initSpVectIdxDenseAcc(b->N,idxsMapAccs+i))	goto _err;
 		}
 	}
@@ -409,7 +410,7 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
 	free(rbRoots);
 	free(rbNodes);
 	if (idxsMapAccs){
-		for (uint i=0; i<maxThreads; i++)	free(idxsMapAccs[i].idxsMap);
+		for (int i=0; i<maxThreads; i++)	free(idxsMapAccs[i].idxsMap);
 		free(idxsMapAccs);
 	}
 
@@ -439,7 +440,7 @@ idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
 	#if 	_OUT_IDXS == TRUE
 	#ifndef OUT_IDXS_RBTREE_NODES	
 	//return the mul.result nnz index inside the rbNodes
-	uint i=0;
+	int i=0;
 	//rbNodeOrderedVisit(n,root)	outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
   	for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
 		outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
@@ -495,7 +496,7 @@ idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
 		*outIdxs[i] = upperBoundedSymMat + cumul; 
   	#endif	//#if _OUT_IDXS  == TRUE
 	//rbTrees for index keeping
-	uint maxThreads 	= omp_get_max_threads(); //TODO FROM CFG
+	int maxThreads 	= omp_get_max_threads(); //TODO FROM CFG
 	idx_t abMaxRowLen 	= reductionMaxSeq(abUpperBoundedRowsLens, a->M);
 	#ifdef 	  HEURISTICS_UB
 	idx_t maxRowLenUB 	= abMaxRowLen * SP3MM_UB_HEURISTIC; //TODO UB HEURISTC
@@ -515,7 +516,7 @@ idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
 			goto _err;
 		}
 		//init roots
-		for (uint i=0; i<maxThreads; i++)	rbRoots[i] = RB_ROOT_CACHED;
+		for (int i=0; i<maxThreads; i++)	rbRoots[i] = RB_ROOT_CACHED;
 		if (!( rbNodes	= malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes)) )){
 			ERRPRINT("Sp3MM_Symb_ rbNodes malloc errdn\n");
 			goto _err;
@@ -527,7 +528,7 @@ idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
 			goto _err;
 		}	
 		//init idxs maps
-		for (uint i=0; i<maxThreads; i++){
+		for (int i=0; i<maxThreads; i++){
 			if(initSpVectIdxDenseAcc(c->N,idxsMapAccs+i))	goto _err;
 		}
 	}
diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c
index a5e6d3be..7112d1a0 100644
--- a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c
+++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c
@@ -42,7 +42,7 @@ spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){	//serial impleme
 	if ( allocAccDense(&acc,B->N) )			goto _free;
 	if (!(AB = allocSpMatrix(A->M,B->N)))	goto _free;
 	for( idx_t r=0; r<A->M; r++ ){
-		for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
+		for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
 			CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, &acc);
 		sparsifyDirect(&acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
 	}
@@ -54,7 +54,7 @@ spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){	//serial impleme
 ////////Sp3MM as 2 x SpMM
 ///1D
 spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
-	DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
+	DEBUG printf("spmm\trows of A,\tfull B\tM=%d x N=%d\n",A->M,B->N);
 	///thread aux
 	ACC_DENSE *accVects = NULL,*acc;
 	SPMM_ACC* outAccumul=NULL;
@@ -83,7 +83,7 @@ spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	((CHUNKS_DISTR_INTERF)	cfg->chunkDistrbFunc) (AB->M,AB,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
 	#pragma omp parallel for schedule(runtime) private(acc)
-	for (ulong r=0;  r<A->M; r++){	//row-by-row formulation
+	for (idx_t r=0;  r<A->M; r++){	//row-by-row formulation
 		//iterate over nz entry index c inside current row r
 		acc = accVects + omp_get_thread_num();
 		/* direct use of sparse scalar vector multiplication
@@ -93,7 +93,7 @@ spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 			bRowLen = B->IRP[ca+1] - B->IRP[ca];
 			CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc);
 		}*/
-		for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
+		for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
 			CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
 		//trasform accumulated dense vector to a CSR row
 		#if SPARSIFY_PRE_PARTITIONING == T
@@ -124,7 +124,7 @@ spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 }
 
 idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void** accumul, void **rows_sizes, void** tmp_matrix){
-	DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
+	DEBUG printf("spmm\trows of A,\tfull B\tM=%d x N=%d\n",A->M,B->N);
 	///thread aux
 	ACC_DENSE *accVects = NULL,*acc;
 	SPMM_ACC* outAccumul=NULL;
@@ -153,7 +153,7 @@ idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void
 	((CHUNKS_DISTR_INTERF)	cfg->chunkDistrbFunc) (AB->M,AB,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
 	#pragma omp parallel for schedule(runtime) private(acc)
-	for (ulong r=0;  r<A->M; r++){	//row-by-row formulation
+	for (idx_t r=0;  r<A->M; r++){	//row-by-row formulation
 		//iterate over nz entry index c inside current row r
 		acc = accVects + omp_get_thread_num();
 		/* direct use of sparse scalar vector multiplication
@@ -163,7 +163,7 @@ idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void
 			bRowLen = B->IRP[ca+1] - B->IRP[ca];
 			CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc);
 		}*/
-		for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
+		for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
 			CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
 		//trasform accumulated dense vector to a CSR row
 		#if SPARSIFY_PRE_PARTITIONING == T
@@ -197,7 +197,7 @@ idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void
 	if(outAccumul)  freeSpMMAcc(outAccumul);
 }
 
-void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double** AS, idx_t** JA, idx_t** IRP){
+void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double* AS, idx_t* JA, idx_t* IRP){
 	SPMM_ACC* outAccumul= *accumul;
 	idx_t* rowsSizes = *rows_sizes;
 	spmat *AB = *tmp_matrix;
@@ -216,7 +216,7 @@ void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void**
 }
 
 spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
-	DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
+	DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%d x N=%d\n",A->M,B->N);
 	DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
 	///thread aux
 	ACC_DENSE *accVects = NULL,*acc;
@@ -243,10 +243,10 @@ spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	#endif
    
 	//perform Gustavson over rows blocks -> M / @cfg->gridRows
-	ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
+	idx_t rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
 	((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
-	ulong b,startRow,block; //omp for aux vars
+	idx_t b,startRow,block; //omp for aux vars
 	#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
 	for (b=0;   b < cfg->gridRows; b++){
 		block	= UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
@@ -255,13 +255,13 @@ spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	   
 		DEBUGPRINT{
 			fflush(NULL);
-			printf("block %lu\t%lu:%lu(%lu)\n",b,startRow,startRow+block-1,block);
+			printf("block %d\t%d:%d(%d)\n",b,startRow,startRow+block-1,block);
 			fflush(NULL);
 		}
 		//row-by-row formulation in the given row block
-		for (ulong r=startRow;  r<startRow+block;  r++){
+		for (idx_t r=startRow;  r<startRow+block;  r++){
 			//iterate over nz entry index c inside current row r
-			for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) 
+			for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) 
 				CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
 			//trasform accumulated dense vector to a CSR row
 			#if SPARSIFY_PRE_PARTITIONING == T
@@ -295,7 +295,7 @@ spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 ///2D
 //PARTITIONS NOT ALLOCATED
 spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ 
-	DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
+	DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%dxN=%d\n",A->M,B->N);
 	DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
 	idx_t* bColOffsets = NULL;   //B group columns starting offset for each row
 	ACC_DENSE *accVectors=NULL,*accV;
@@ -305,10 +305,10 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	idx_t*	rowsPartsSizes=NULL;
 	if (!AB)	goto _err;
 	//2D indexing aux vars
-	ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
-	ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
-	ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
-	ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
+	idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
+	idx_t _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
+	idx_t _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
+	idx_t startRow,startCol,rowBlock,colBlock; //data division aux variables
 	////get bColOffsets for B column groups 
 	if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
 		goto _err;
@@ -344,8 +344,8 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 	
 	((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
-	ulong tileID,t_i,t_j;							//for aux vars
-	ulong bPartLen,bPartID,bPartOffset;//B partition acces aux vars
+	idx_t tileID,t_i,t_j;							//for aux vars
+	idx_t bPartLen,bPartID,bPartOffset;//B partition acces aux vars
 	#pragma omp parallel for schedule(runtime) \
 	  private(accV,accRowPart,rowBlock,colBlock,startRow,startCol,\
 	  bPartLen,bPartID,bPartOffset,t_i,t_j)
@@ -364,15 +364,15 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 		DEBUGPRINT{
 			fflush(NULL);
 			colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem);
-			printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock);
-			printf("colBlock [%lu\t%lu:%lu(%lu)]\n",t_j,startCol,startCol+colBlock-1,colBlock);
+			printf("rowBlock [%d\t%d:%d(%d)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock);
+			printf("colBlock [%d\t%d:%d(%d)]\n",t_j,startCol,startCol+colBlock-1,colBlock);
 			fflush(NULL);
 		}
 		///AB[t_i][t_j] block compute
-		for (ulong r=startRow;  r<startRow+rowBlock;  r++){
+		for (idx_t r=startRow;  r<startRow+rowBlock;  r++){
 			//iterate over nz col index j inside current row r
 			//row-by-row restricted to colsubset of B to get AB[r][:colBlock:]
-			for (ulong j=A->IRP[r]-OFF_F,c; j<A->IRP[r+1]-OFF_F; j++){
+			for (idx_t j=A->IRP[r]-OFF_F,c; j<A->IRP[r+1]-OFF_F; j++){
 				//get start of B[A->JA[j]][:colBlock:]
 				c = A->JA[j]-OFF_F; // col of nnz in A[r][:] <-> target B row
 				bPartID	 = IDX2D(c,t_j,cfg->gridCols); 
@@ -415,7 +415,7 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
 }
 
 spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
-	DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%luxN=%lu\n",A->M,B->N);
+	DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%dxN=%d\n",A->M,B->N);
 	DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
 	spmat *AB = NULL, *colPartsB = NULL, *colPart;
 	idx_t*   rowsPartsSizes=NULL;
@@ -423,14 +423,14 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
 	SPMM_ACC* outAccumul=NULL;
 	ACC_DENSE *accVectors=NULL,*accV;
 	SPACC* accRowPart;
-	ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
+	idx_t startRow,startCol,rowBlock,colBlock; //data division aux variables
 	//2D indexing aux vars
 	idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
 	idx_t* bColOffsets = NULL;
 	
 	if (!(AB = allocSpMatrix(A->M,B->N)))		   goto _err;
-	ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
-	ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
+	idx_t _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
+	idx_t _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
 	
 	////B cols  partition in CSRs
 	//if (!(colPartsB = CAT(colsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))  goto _err;
@@ -465,7 +465,7 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
 	
 	((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
-	ulong tileID,t_i,t_j;	//for aux vars
+	idx_t tileID,t_i,t_j;	//for aux vars
 	#pragma omp parallel for schedule(runtime) \
 	  private(accV,accRowPart,colPart,rowBlock,colBlock,startRow,startCol,t_i,t_j)
 	for (tileID = 0; tileID < gridSize; tileID++){
@@ -484,17 +484,17 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
 		 
 		DEBUGPRINT{
 			fflush(NULL);
-			printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",
+			printf("rowBlock [%d\t%d:%d(%d)]\t",
 				t_i,startRow,startRow+rowBlock-1,rowBlock);
-			printf("colBlock [%lu\t%lu:%lu(%lu)]\n",
+			printf("colBlock [%d\t%d:%d(%d)]\n",
 				t_j,startCol,startCol+colBlock-1,colBlock);
 			fflush(NULL);
 		}
 		///AB[t_i][t_j] block compute
-		for (ulong r=startRow;  r<startRow+rowBlock;  r++){
+		for (idx_t r=startRow;  r<startRow+rowBlock;  r++){
 			//iterate over nz col index j inside current row r
 			//row-by-row restricted to colsubset of B to get AB[r][:colBlock:]
-			for (ulong j=A->IRP[r]-OFF_F,c,bRowStart,bRowLen; j<A->IRP[r+1]-OFF_F; j++){
+			for (idx_t j=A->IRP[r]-OFF_F,c,bRowStart,bRowLen; j<A->IRP[r+1]-OFF_F; j++){
 				//get start of B[A->JA[j]][:colBlock:]
 				c = A->JA[j]-OFF_F; // column of nnz entry in A[r][:] <-> target B row
 				bRowStart = colPart->IRP[c];
@@ -532,7 +532,7 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
 	AB = NULL; 
 	_free:
 	if (colPartsB){
-		for (ulong i=0; i<cfg->gridCols; i++)
+		for (idx_t i=0; i<cfg->gridCols; i++)
 			freeSpmatInternal(colPartsB+i);
 		free(colPartsB);
 	}
@@ -558,7 +558,7 @@ spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,
 	/* TODO 
 	alloc dense aux vector, reusable over 3 product 
 	TODO arrays sovrallocati per poter essere riusati nelle 2 SpMM
-	ulong auxVectSize = MAX(R->N,AC->N);
+	idx_t auxVectSize = MAX(R->N,AC->N);
 	auxVectSize	  = MAX(auxVectSize,P->N);
 	*/
 	
@@ -590,7 +590,7 @@ spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,
 spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,
 	   SPMM_INTERF spmm){
 	
-	ulong* rowSizes = NULL;
+	idx_t* rowSizes = NULL;
 	SPMM_ACC* outAccumul=NULL;
 	ACC_DENSE *accVectorsR_AC=NULL,*accVectorsRAC_P=NULL,*accRAC,*accRACP;
 
@@ -619,11 +619,11 @@ spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,
 		goto _err;
 	}
 
-	ulong c;
+	idx_t c;
 	((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (R->M,R,cfg);
 	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
 	#pragma omp parallel for schedule(runtime) private(accRAC,accRACP,c)
-	for (ulong r=0;  r<R->M; r++){	//row-by-row formulation
+	for (idx_t r=0;  r<R->M; r++){	//row-by-row formulation
 		//iterate over nz entry index c inside current row r
 		accRAC  = accVectorsR_AC  + omp_get_thread_num();
 		accRACP = accVectorsRAC_P + omp_get_thread_num();
diff --git a/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c b/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c
index e3cea72b..08720fd6 100644
--- a/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c
+++ b/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c
@@ -46,10 +46,10 @@
  * alloc threads' aux arrays once and split them in threads' structures
  * so free them once from the first thread struct, with the original pointers returned from the alloc
  */
-ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEXT
+ACC_DENSE* _initAccVectors_monoalloc(idx_t num,idx_t size){ //TODO PERF WITH NEXT
 	ACC_DENSE* out	 = NULL;
 	double* vAll	 = NULL;
-	ulong* vAllNzIdx = NULL;
+	idx_t* vAllNzIdx = NULL;
 	if (!(out = calloc(num,sizeof(*out)))){
 		ERRPRINT("_initAccVectors aux struct alloc failed\n");
 		return NULL;
@@ -62,7 +62,7 @@ ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEX
 		ERRPRINT("_initAccVectors aux dense vectors' idx alloc failed\n");
 		goto err;
 	}
-	for (ulong i=0; i<num; i++){
+	for (idx_t i=0; i<num; i++){
 		out[i].vLen	= size; //TODO USELESS INFO?
 		out[i].v	= vAll	  + i*size;  
 		out[i].nnzIdx	= vAllNzIdx + i*size;;
@@ -76,7 +76,7 @@ ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEX
 	free(vAllNzIdx);
 	return NULL;
 }
-int allocAccDense(ACC_DENSE* v,ulong size){
+int allocAccDense(ACC_DENSE* v,idx_t size){
 		v->vLen = size; 
 		if (!(v->v = calloc(size,sizeof(*(v->v))))) {
 			ERRPRINT("_initAccVectors aux dense vector alloc failed\n");
@@ -91,19 +91,19 @@ int allocAccDense(ACC_DENSE* v,ulong size){
 		return EXIT_SUCCESS;
 }
 
-ACC_DENSE* _initAccVectors(ulong num,ulong size){
+ACC_DENSE* _initAccVectors(idx_t num,idx_t size){
 	ACC_DENSE* out	= NULL;
 	if (!(out = calloc(num,sizeof(*out)))){
 		ERRPRINT("_initAccVectors aux struct alloc failed\n");
 		return NULL;
 	}
-	for (ulong i=0; i<num; i++){
+	for (idx_t i=0; i<num; i++){
 		if (allocAccDense(out+i,size))   goto _err;
 	}
 	return out;
 	
 	_err:
-	for (ulong i=0; i<num; i++){
+	for (idx_t i=0; i<num; i++){
 		if (out[i].v)	   free(out[i].v);
 		if (out[i].nnzIdx)  free(out[i].nnzIdx);
 	}
@@ -111,17 +111,17 @@ ACC_DENSE* _initAccVectors(ulong num,ulong size){
 	return NULL;
 }
 
-void freeAccsDense(ACC_DENSE* vectors,ulong num){
-	for (ulong i=0; i<num; i++){
+void freeAccsDense(ACC_DENSE* vectors,idx_t num){
+	for (idx_t i=0; i<num; i++){
 		free(vectors[i].v);
 		free(vectors[i].nnzIdx);
 		free(vectors[i].nnzIdxMap.idxsMap);
 	}
 	free(vectors);
 }
-void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num){ 
+void _freeAccsDenseChecks(ACC_DENSE* vectors,idx_t num){ 
 	if (!vectors)   return;
-	for (ulong i=0; i<num; i++){
+	for (idx_t i=0; i<num; i++){
 		if(vectors[i].v)	free(vectors[i].v);
 		if(vectors[i].nnzIdx)   free(vectors[i].nnzIdx);
 	}
@@ -143,8 +143,8 @@ int initSpVectIdxDenseAcc(idx_t idxMax,SPVECT_IDX_DENSE_MAP* vectIdxsMap){
 	return EXIT_SUCCESS;
 }
 
-void checkOverallocPercent(ulong* forecastedSizes,spmat* AB){
-	for (ulong r=0,rSize,forecastedSize; r < AB->M; r++){
+void checkOverallocPercent(idx_t* forecastedSizes,spmat* AB){
+	for (idx_t r=0,rSize,forecastedSize; r < AB->M; r++){
 		forecastedSize = forecastedSizes[r];
 		#ifdef ROWLENS
 		rSize = AB->RL[r];
@@ -158,11 +158,11 @@ void checkOverallocPercent(ulong* forecastedSizes,spmat* AB){
 			}
 		}
 		DEBUGPRINT
-			printf("extra forecastedSize of row: %lu\t=\t%lf %% \n",
+			printf("extra forecastedSize of row: %d\t=\t%lf %% \n",
 			  r,100*(forecastedSize-rSize) / (double) forecastedSize);
 	}
 	idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
-	printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
+	printf("extra forecastedSize of the matrix: \t%d\t = %lf %% \n",
 	  extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
 }
 int spmatDiff(spmat* A, spmat* B){
@@ -187,8 +187,8 @@ int spmatDiff(spmat* A, spmat* B){
 
 double* CSRToDense(spmat* sparseMat){
 	double* denseMat;
-	ulong i,j,idxNZ,denseSize;
-	if (__builtin_umull_overflow(sparseMat->M,sparseMat->N,&denseSize)){
+	idx_t i,j,idxNZ,denseSize;
+	if (__builtin_smul_overflow(sparseMat->M,sparseMat->N,&denseSize)){
 		ERRPRINT("overflow in dense allocation\n");
 		return NULL;
 	}
@@ -200,7 +200,7 @@ double* CSRToDense(spmat* sparseMat){
 		for (idxNZ=sparseMat->IRP[i]; idxNZ<sparseMat->IRP[i+1]; ++idxNZ){
 			 j = sparseMat->JA[idxNZ];
 			 //converting sparse item into dense entry
-			 denseMat[(ulong) IDX2D(i,j,sparseMat->N)] = sparseMat->AS[idxNZ]; 
+			 denseMat[(idx_t) IDX2D(i,j,sparseMat->N)] = sparseMat->AS[idxNZ]; 
 		}
 	}
 	return denseMat;
@@ -214,17 +214,17 @@ void printSparseMatrix(spmat* spMatrix,char justNZMarkers){
 
 
 
-static inline int _colsPartitioningUnifRanges_init(spmat* A,uint gridCols,
+static inline int _colsPartitioningUnifRanges_init(spmat* A,int gridCols,
   spmat** colParts,idx_t** colPartsLens){
 
 	spmat* colPart;
-	ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *tmpJA;
+	idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *tmpJA;
 	///alloc/init partitions structures
 	if (!(*colParts = calloc(gridCols, sizeof(**colParts)))){
 		ERRPRINT("colsPartitioningUnifRanges\tcolumns partitions of A calloc fail\n");
 		return EXIT_FAILURE;
 	}
-	for (ulong i=0,colBlock; i<gridCols; i++){
+	for (idx_t i=0,colBlock; i<gridCols; i++){
 		colBlock = UNIF_REMINDER_DISTRI(i,_colBlock,_colBlockRem);
 		colPart  = *colParts + i;
 		if (allocSpMatrixInternal(A->M,colBlock,colPart)){
@@ -249,13 +249,13 @@ static inline int _colsPartitioningUnifRanges_init(spmat* A,uint gridCols,
 	return EXIT_SUCCESS;
 }
 
-static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,uint gridCols,
+static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,int gridCols,
   spmat* colParts,idx_t* colPartsLens){
 
 	spmat* colPart;
 	double* tmpAS; idx_t* tmpJA;
 	//realloc overallcd A parts NZ arrays (TODO -> downsizing -> nofails?)
-	for (ulong i=0,partLen; i<gridCols; i++){
+	for (idx_t i=0,partLen; i<gridCols; i++){
 		colPart = colParts + i;
 		partLen = colPartsLens[i];
 		if (!(tmpAS = realloc(colPart->AS,partLen*sizeof(*(colPart->AS))))){
@@ -281,22 +281,22 @@ static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,uint gridCol
 #endif
 ////////////////////////  CSR SPECIFIC -- TODO RENAME //////////////////
 ///SPARSE MATRIX PARTITIONING
-idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
-	ulong subRowsN = A->M * gridCols;
-	ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
-	ulong* offsets = malloc( (subRowsN+1) * sizeof(*offsets) );
+idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols){
+	idx_t subRowsN = A->M * gridCols;
+	idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
+	idx_t* offsets = malloc( (subRowsN+1) * sizeof(*offsets) );
 	if (!offsets)  {
 		ERRPRINT("colsOffsetsPartitioningUnifRanges:\toffsets malloc errd\n");
 		return NULL;
 	}
 	///OFFSETS COMPUTE FOR COL GROUPS -> O( A.NZ )
-	for (ulong r=0, j=0;	 r<A->M;	 j=A->IRP[++r]-OFF_F){
+	for (idx_t r=0, j=0;	 r<A->M;	 j=A->IRP[++r]-OFF_F){
 		offsets[ IDX2D(r,0,gridCols) ] = j;  //row's first gc start is costrained
 		//navigate column groups inside current row
-		for (ulong gc=1,gcStartCol;  gc<gridCols;  gc++){
+		for (idx_t gc=1,gcStartCol;  gc<gridCols;  gc++){
 			gcStartCol = UNIF_REMINDER_DISTRI_STARTIDX(gc,_colBlock,_colBlockRem);
 			//goto GroupCols start entry,keeping A's nnz entries navigation (idx j)
-			//for (ulong c=A->JA[j]-OFF_F; c<gcStartCol && j < A->IRP[r+1]-OFF_F; c=A->JA[++j]-OFF_F);
+			//for (idx_t c=A->JA[j]-OFF_F; c<gcStartCol && j < A->IRP[r+1]-OFF_F; c=A->JA[++j]-OFF_F);
 			while (j < A->IRP[r+1]-OFF_F &&  A->JA[j]-OFF_F < gcStartCol)  
 				j++;
 			offsets[ IDX2D(r,gc,gridCols) ] = j;  //row's gc group startIdx
@@ -306,12 +306,12 @@ idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
 	return offsets;
 }
 
-spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,
+spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,int gridCols,
 	   idx_t** colPartsOffsets)
 {
 	spmat *colParts = NULL, *colPart;
-	ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
-	ulong *colPartsLens=NULL, *tmpJA;
+	idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
+	idx_t *colPartsLens=NULL, *tmpJA;
 	double* tmpAS;
 
 	///alloc/init partitions structures
@@ -349,15 +349,15 @@ spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,
 	return colParts;
 	_err:
 	free(*colPartsOffsets);
-	for (ulong i=0; i<gridCols; i++)   	freeSpmatInternal(colParts+i);
+	for (idx_t i=0; i<gridCols; i++)   	freeSpmatInternal(colParts+i);
 	free(colOffsets);
 	free(colParts);
 	free(colPartsLens);
 	return NULL;
 }
-spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
+spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols){
 	spmat *colParts, *colPart;
-	ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *colPartsLens=NULL, *tmpJA;
+	idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *colPartsLens=NULL, *tmpJA;
 	double* tmpAS;
 	///alloc/init partitions structures
 	if (_colsPartitioningUnifRanges_init(A,gridCols,&colParts,&colPartsLens))
@@ -366,9 +366,9 @@ spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
 	 * Parallelize: 2for collapse OMP, gcEndCol -> startIdxize, ...
 	 * oppure wrappare cio in static inline 
 	 */
-	for (ulong r=0, j=0;	 r<A->M;	 j=A->IRP[++r]-OFF_F){
+	for (idx_t r=0, j=0;	 r<A->M;	 j=A->IRP[++r]-OFF_F){
 		//navigate column groups inside current row
-		for (ulong gc=0,gcEndCol=0,i;  gc<gridCols ;  gc++,j+=i){
+		for (idx_t gc=0,gcEndCol=0,i;  gc<gridCols ;  gc++,j+=i){
 			i = 0;  //@i=len current subpartition of row @r to copy
 			colPart = colParts + gc;
 			//NB: not shifting IRP because is handled as internal implementation component
@@ -376,7 +376,7 @@ spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
 			colPart->IRP[r] = colPartsLens[gc];	
 			gcEndCol += UNIF_REMINDER_DISTRI(gc,_colBlock,_colBlockRem);
 			//goto next GroupCols,keeping A's nnz entries navigation ( index j+i )
-			//for (ulong c=A->JA[j+i]-OFF_F; c<gcEndCol && j+i  < A->IRP[r+1]-OFF_F; c=A->JA[j+ ++i]-OFF_F);
+			//for (idx_t c=A->JA[j+i]-OFF_F; c<gcEndCol && j+i  < A->IRP[r+1]-OFF_F; c=A->JA[j+ ++i]-OFF_F);
 			while ( j+i < A->IRP[r+1]-OFF_F && A->JA[j+i]-OFF_F < gcEndCol ) i++;
 			memcpy(colPart->AS+colPart->IRP[r], A->AS+j, i*sizeof(*A->AS));
 			memcpy(colPart->JA+colPart->IRP[r], A->JA+j, i*sizeof(*A->JA));
@@ -393,14 +393,14 @@ spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
 	free(colPartsLens);
 	return colParts;
 	_err:
-	for (ulong i=0; i<gridCols; i++)   freeSpmatInternal(colParts+i);
+	for (idx_t i=0; i<gridCols; i++)   freeSpmatInternal(colParts+i);
 	if(colParts)		free(colParts);
 	if(colPartsLens)	free(colPartsLens);
 	return NULL;
 }
 
 
-void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
+void CAT(checkOverallocRowPartsPercent_,OFF_F)(idx_t* forecastedSizes,spmat* AB,
   				   idx_t gridCols,idx_t* bColOffsets){
 	idx_t* abColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(AB, gridCols);
 	assert(abColOffsets);	//partitioning error
@@ -410,7 +410,7 @@ void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
 		DEBUGCHECKS	assert(forecast >= rLen);
 	}
 	idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
-	printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
+	printf("extra forecastedSize of the matrix: \t%d\t = %lf %% \n",
 		extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
 
 	free(abColOffsets);
@@ -422,25 +422,25 @@ void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
 
 ///inline export here 
 //SPMV_CHUNKS_DISTR spmvChunksFair; 
-spmat* allocSpMatrix(ulong rows, ulong cols);
-int allocSpMatrixInternal(ulong rows, ulong cols, spmat* mat);
+spmat* allocSpMatrix(idx_t rows, idx_t cols);
+int allocSpMatrixInternal(idx_t rows, idx_t cols, spmat* mat);
 void freeSpmatInternal(spmat* mat);
 void freeSpmat(spmat* mat);
 
 ////INTERNAL TEST FUNCTIONS
 //test that each row's partition from colsOffsetsPartitioningUnifRanges is in the correct index range
 #include <alloca.h>
-int testColsOffsetsPartitioningUnifRanges(spmat* mat,ulong gridCols,ulong* partsOffs){
-	ulong _colBlock = mat->N/gridCols, _colBlockRem = mat->N%gridCols;
-	ulong j=0;	//CSR scanning nnz idx
-	ulong* colPartsPopulations = alloca(gridCols * sizeof(*colPartsPopulations));
+int testColsOffsetsPartitioningUnifRanges(spmat* mat,idx_t gridCols,idx_t* partsOffs){
+	idx_t _colBlock = mat->N/gridCols, _colBlockRem = mat->N%gridCols;
+	idx_t j=0;	//CSR scanning nnz idx
+	idx_t* colPartsPopulations = alloca(gridCols * sizeof(*colPartsPopulations));
 	memset(colPartsPopulations,0,gridCols * sizeof(*colPartsPopulations));
-	for (ulong r=0,pId=0; r<mat->M; r++){
-		for (ulong gc=0,pStartIdx,pEndIdx; gc<gridCols; gc++,pId++){
+	for (idx_t r=0,pId=0; r<mat->M; r++){
+		for (idx_t gc=0,pStartIdx,pEndIdx; gc<gridCols; gc++,pId++){
 			pStartIdx = UNIF_REMINDER_DISTRI_STARTIDX(gc,_colBlock,_colBlockRem);
 			pEndIdx   = UNIF_REMINDER_DISTRI_STARTIDX(gc+1,_colBlock,_colBlockRem)-1; 
 			//pId=IDX2D(r,gc,gridCols);
-			for (ulong idx=partsOffs[pId],c; idx<partsOffs[pId+1]; idx++,j++){
+			for (idx_t idx=partsOffs[pId],c; idx<partsOffs[pId+1]; idx++,j++){
 				c = mat->JA[idx];
 				assert(j == idx); //consecutive index in partitioning
 				assert(pStartIdx <= c && c <= pEndIdx);	//colRange
@@ -450,12 +450,12 @@ int testColsOffsetsPartitioningUnifRanges(spmat* mat,ulong gridCols,ulong* parts
 		}
 	}
 	assert(j == mat->NZ);
-	ulong s=0;
-	for (ulong gc=0,partSize; gc < gridCols; gc++,s+=partSize){
+	idx_t s=0;
+	for (idx_t gc=0,partSize; gc < gridCols; gc++,s+=partSize){
 		partSize = colPartsPopulations[gc];
 		double partShare=partSize/(double)mat->NZ,partsAvg=1/(double)gridCols;
 		double partShareAvgDiff = partShare - partsAvg;
-		printf("colPartition %lu has:\t%lu = %lf of NNZ\t\t .. %lf\tAVG diff\n",
+		printf("colPartition %d has:\t%d = %lf of NNZ\t\t .. %lf\tAVG diff\n",
 		  gc,partSize,partShare,partShareAvgDiff);
 	}
 	assert(s == mat->NZ); //TODO DUPLICATED
@@ -482,14 +482,14 @@ int main(int argc, char** argv){
 		return out;
 	}
 	////partitioning test
-	ulong* colsPartitions = colsOffsetsPartitioningUnifRanges_0(mat,Conf.gridCols);
+	idx_t* colsPartitions = colsOffsetsPartitioningUnifRanges_0(mat,Conf.gridCols);
 	if (!colsPartitions)	goto _free;
 	if (testColsOffsetsPartitioningUnifRanges(mat,Conf.gridCols,colsPartitions))
 		goto _free;
 
 	out=EXIT_SUCCESS;
 	printf("testColsOffsetsPartitioningUnifRanges passed with "
-		   "mat: %lux%lu-%luNNZ\tgrid: %dx%d\n",
+		   "mat: %dx%d-%dNNZ\tgrid: %dx%d\n",
 			mat->M,mat->N,mat->NZ,Conf.gridRows,Conf.gridCols);
 	_free:
 	if (colsPartitions) free(colsPartitions);
diff --git a/base/serial/impl/sp3mm4amg/commons/utils.c b/base/serial/impl/sp3mm4amg/commons/utils.c
index d969bf82..8705bece 100644
--- a/base/serial/impl/sp3mm4amg/commons/utils.c
+++ b/base/serial/impl/sp3mm4amg/commons/utils.c
@@ -145,12 +145,12 @@ int fread_wrap(FILE* fp,void* dst,size_t count){
 	return rd;
 }
 ///STRUCTURED DATA IO
-int writeDoubleVector(char* fpath,double* v,ulong size){
+int writeDoubleVector(char* fpath,double* v,idx_t size){
 	int fd,out=EXIT_FAILURE;
 	if ( (fd = createNewFile(fpath) ) < 0 ) goto _end;
 	write_wrap(fd,v,size * sizeof(*v)); //raw write of vector
 	out = EXIT_SUCCESS;
-	DEBUG printf("written double vector into %s RAW of %lu elements\n",fpath,size);
+	DEBUG printf("written double vector into %s RAW of %d elements\n",fpath,size);
 
 	_end:
 	if (close(fd) == EOF)  perror("close errd\n");
@@ -158,21 +158,21 @@ int writeDoubleVector(char* fpath,double* v,ulong size){
 }
 
 ///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
-int writeDoubleVectorAsStr(char* fpath,double* v,ulong size){
+int writeDoubleVectorAsStr(char* fpath,double* v,idx_t size){
 	int out=EXIT_FAILURE;
 	FILE* fp = fopen(fpath,"w");
 	if (!fp){
 		perror("fopen vector file write");
 		return EXIT_FAILURE;
 	}
-	for (ulong i=0; i<size; i++){
+	for (idx_t i=0; i<size; i++){
 		if (fprintf(fp,DOUBLE_STR_FORMAT,v[i]) < 0){
 			ERRPRINT("fprintf to out vector file errd\n");
 			goto _end;
 		} 
 	}
 	out = EXIT_SUCCESS;
-	DEBUG printf("written vector into %s of %lu elements\n",fpath,size);
+	DEBUG printf("written vector into %s of %d elements\n",fpath,size);
 
 	_end:
 	if (fclose(fp) == EOF)  perror("fclose errd\n");
@@ -180,10 +180,10 @@ int writeDoubleVectorAsStr(char* fpath,double* v,ulong size){
 
 }
 
-double* readDoubleVector(char* fpath,ulong* size){
+double* readDoubleVector(char* fpath,idx_t* size){
 	int rd,hleft;
 	double *out,*tmp;
-	ulong i=0,vectorSize=RNDVECTORSIZE;
+	idx_t i=0,vectorSize=RNDVECTORSIZE;
 	if (*size)   vectorSize = *size;
 	FILE* fp = fopen(fpath,"r");
 	if (!fp){
@@ -198,13 +198,13 @@ double* readDoubleVector(char* fpath,ulong* size){
 		if (i >= vectorSize ){ //realloc the array
 			vectorSize *= VECTOR_STEP_REALLOC;
 			if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){
-				ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20);
+				ERRPRINTS("realloc errd to ~~ %d MB\n",vectorSize >> 20);
 				goto _err;
 			}
 			out = tmp;
-			DEBUG   printf("reallocd to ~~ %lu MB\n",vectorSize >> 20);
+			DEBUG   printf("reallocd to ~~ %d MB\n",vectorSize >> 20);
 		}
-		ulong toRead = MIN(VECTOR_READ_BLOCK,(vectorSize-i));
+		idx_t toRead = MIN(VECTOR_READ_BLOCK,(vectorSize-i));
 		if((rd = fread_wrap(fp,out + i,sizeof(*out)*toRead)) == -2){
 			ERRPRINT("fread_wrap errd\n");
 			goto _err;
@@ -227,7 +227,7 @@ double* readDoubleVector(char* fpath,ulong* size){
 		goto _err;
 	}
 	out = tmp;
-	DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size);
+	DEBUG printf("readed vector from %s of %d elements\n",fpath,*size);
 	goto _free;
 
 	_err:
@@ -238,10 +238,10 @@ double* readDoubleVector(char* fpath,ulong* size){
 	return out;
 }
 
-double* readDoubleVectorStr(char* fpath,ulong* size){
+double* readDoubleVectorStr(char* fpath,idx_t* size){
 	int fscanfOut;
 	double *out,*tmp;
-	ulong i=0,vectorSize=RNDVECTORSIZE;
+	idx_t i=0,vectorSize=RNDVECTORSIZE;
 	if (*size)   vectorSize = *size;
 	FILE* fp = fopen(fpath,"r");
 	if (!fp){
@@ -256,11 +256,11 @@ double* readDoubleVectorStr(char* fpath,ulong* size){
 		if (i >= vectorSize ){ //realloc the array
 			vectorSize *= VECTOR_STEP_REALLOC;
 			if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){
-				ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20);
+				ERRPRINTS("realloc errd to ~~ %d MB\n",vectorSize >> 20);
 				goto _err;
 			}
 			out = tmp;
-			DEBUG   printf("reallocd to ~~ %lu MB\n",vectorSize >> 20);
+			DEBUG   printf("reallocd to ~~ %d MB\n",vectorSize >> 20);
 		}
 		fscanfOut = fscanf(fp,DOUBLE_STR_FORMAT,out + i++ );
 		if ( fscanfOut == EOF && ferror(fp)){
@@ -277,7 +277,7 @@ double* readDoubleVectorStr(char* fpath,ulong* size){
 		goto _err;
 	}
 	out = tmp;
-	DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size);
+	DEBUG printf("readed vector from %s of %d elements\n",fpath,*size);
 	goto _free;
 
 	_err:
@@ -292,10 +292,10 @@ double* readDoubleVectorStr(char* fpath,ulong* size){
 int getConfig(CONFIG* conf){
 	int changes=EXIT_FAILURE;
 	char *varVal,*ptr;
-	ulong val;
+	idx_t val;
 	if ((varVal = getenv(GRID_ROWS))){
 		val=strtoul(varVal,&ptr,10);
-		if (ptr==varVal || val>= UINT_MAX){
+		if (ptr==varVal || val>= INT_MAX){
 			perror("strtol errd");
 		} else {
 			conf->gridRows = val;
@@ -304,7 +304,7 @@ int getConfig(CONFIG* conf){
 	}
 	if ((varVal = getenv(GRID_COLS))){
 		val=strtoul(varVal,&ptr,10);
-		if (ptr==varVal || val>= UINT_MAX){
+		if (ptr==varVal || val>= INT_MAX){
 			perror("strtol errd");
 		} else {
 			conf->gridCols = val;
@@ -317,11 +317,11 @@ int getConfig(CONFIG* conf){
 /////LIB-SORTING -- WRAPPERS
 //comparing functions
 static inline int cmp_idx_t(const void* a, const void*b){
-	idx_t aa=*((ulong*) a), bb = *((ulong*) b);
+	idx_t aa=*((idx_t*) a), bb = *((idx_t*) b);
 	return aa==bb?0:aa>bb?1:-1;
 }
-static inline int cmpulong(const void* a, const void*b){
-	ulong aa=*((ulong*) a), bb = *((ulong*) b);
+static inline int cmpidx_t(const void* a, const void*b){
+	idx_t aa=*((idx_t*) a), bb = *((idx_t*) b);
 	return aa==bb?0:aa>bb?1:-1;
 }
 static inline int cmpuint(const void* a, const void*b){
@@ -336,8 +336,8 @@ static inline int cmpRbNode(const void* a, const void* b){
 void sort_idx_t(idx_t* arr, idx_t len){
 	qsort(arr,len,sizeof(*arr),cmp_idx_t);
 }
-void sortulong(ulong* arr, ulong len){
-	qsort(arr,len,sizeof(*arr),cmpulong);
+void sortidx_t(idx_t* arr, idx_t len){
+	qsort(arr,len,sizeof(*arr),cmpidx_t);
 }
 void sortuint(uint* arr, uint len){
 	qsort(arr,len,sizeof(*arr),cmpuint);
@@ -368,7 +368,7 @@ static inline int rndDouble_sinDecimal(double* d){
 		ERRPRINT("read_wrap failed to read holder for rnd double\n");
 		return EXIT_FAILURE;
 	}
-	*d = (_rndHold % (ulong) ceil(MAXRND)) + sin(_rndHold);
+	*d = (_rndHold % (idx_t) ceil(MAXRND)) + sin(_rndHold);
 	return EXIT_SUCCESS;
 }
    
@@ -383,8 +383,8 @@ void statsAvgVar(double* values,uint numVals, double* out){
 }
 
 /// MATRIX - VECTOR COMPUTE UTILS
-int fillRndVector(ulong size, double* v){
-	for( ulong x=0; x<size; ++x ){
+int fillRndVector(idx_t size, double* v){
+	for( idx_t x=0; x<size; ++x ){
 		if(rndDouble_sinAll( v+x )) return EXIT_FAILURE;
 		#ifdef RNDVECTMIN
 		v[x] += RNDVECTMIN;
@@ -394,18 +394,18 @@ int fillRndVector(ulong size, double* v){
 }
 
 //convention true result in @a, toCheck in @b
-int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
+int doubleVectorsDiff(double* a, double* b, idx_t n,double* diffMax){
 	int out = EXIT_SUCCESS;
 	double diff,diffAbs,_diffMax=0;
 	if (diffMax)	*diffMax = 0;
 	else diffMax = &_diffMax;
-	for (ulong i=0; i<n; i++){
+	for (idx_t i=0; i<n; i++){
 		diff	= a[i] - b[i];
 		diffAbs = ABS( diff );
 		if( diffAbs > DOUBLE_DIFF_THREASH ){
 			out = EXIT_FAILURE;
 			ERRPRINTS("DOUBLE VECTORS DIFF: DOUBLE_DIFF_THREASH=%lf\t<\t"
-				  "|%13lg| = %lf %% of @a[%lu]\n",
+				  "|%13lg| = %lf %% of @a[%d]\n",
 			  	  DOUBLE_DIFF_THREASH,diff,100*diffAbs/ABS(a[i]),i);
 			#ifdef DOUBLE_VECT_DIFF_EARLY_EXIT
 			/*#pragma message("DOUBLE_VECT_DIFF_EARLY_EXIT: only first diff double reported")*/
@@ -415,7 +415,7 @@ int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
 		if ( ABS(*diffMax) < diffAbs )   *diffMax = diff;
 	}
 	DEBUG{
-		printf("\nchecked diff %s"CEND" between 2 double vector of %lu elements"
+		printf("\nchecked diff %s"CEND" between 2 double vector of %d elements"
 			   "\tmax diff: %le %s threash: %le\n", !out?CCC"OK":CCCERR"ERR!",
 				n,*diffMax,*diffMax<DOUBLE_DIFF_THREASH?"<":">=",
 			   DOUBLE_DIFF_THREASH);
@@ -427,9 +427,9 @@ int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
 	return out;
 }
 
-void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers){
-	printf("printing matrix: %lu x %lu\n",m,n);
-	ulong i,j;
+void printMatrix(double* mat,idx_t m,idx_t n,char justNZMarkers){
+	printf("printing matrix: %d x %d\n",m,n);
+	idx_t i,j;
 	for (i=0;i<m;i++){
 		for (j=0;j<n;j++){
 			if (justNZMarkers)  printf("%s",mat[IDX2D(i,j,n)]?".":" ");
@@ -441,8 +441,8 @@ void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers){
 }
 
 
-void printVector(double* v,ulong size){
-	for( ulong i=0;i<size;i++)   printf("%1.1lf ",v[i]);
+void printVector(double* v,idx_t size){
+	for( idx_t i=0;i<size;i++)   printf("%1.1lf ",v[i]);
 	printf("\n");
 }
 
@@ -464,7 +464,7 @@ static inline char* searchPatternInStrs(char* pattern,char** strings){
 	return out;
 }
 
-inline int appendArr(ulong val,APPENDARRAY* list){
+inline int appendArr(idx_t val,APPENDARRAY* list){
 	return 0;   //TODO
 }
 
diff --git a/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c b/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c
index 1f339bd6..2f77f2a1 100644
--- a/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c
+++ b/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c
@@ -9,13 +9,23 @@ enum impl_types
 };
 
 CHUNKS_DISTR chunksFair;
+CHUNKS_DISTR chunksFairFolded;
+CHUNKS_DISTR chunksNOOP;
+
+CHUNKS_DISTR_INTERF chunkDistrbFunc=&chunksFairFolded;
+
+static CONFIG Conf = {
+	.gridRows  = 20,
+	.gridCols  = 2,
+	.symbMMRowImplID = RBTREE,
+};
 
 void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
                              double **a_as_ptr, idx_t **a_ja_ptr,
-                             idx_t **a_irp_ptr, idx_t a_max_row_nz,
+                             idx_t **a_irp_ptr,
                              idx_t b_m, idx_t b_n, idx_t b_nz,
                              double **b_as_ptr, idx_t **b_ja_ptr,
-                             idx_t **b_irp_ptr, idx_t b_max_row_nz,
+                             idx_t **b_irp_ptr,
                              enum impl_types impl_choice,
                              void **accumul,
                              void **rows_sizes,
@@ -24,7 +34,6 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
 {
     int rc;
     spmat a, b;
-    CONFIG cfg;
 
     double *a_as = *a_as_ptr;
     idx_t *a_ja = *a_ja_ptr;
@@ -39,15 +48,32 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
     b->RL = b_rl;
 #endif // ROWLENS
 
-    // setting up cfg
-    // TODO : CHECK THAT THIS IS COMPATIBLE WITH PSB
-    rc = getConfig(&cfg);
+	if (!getConfig(&Conf)){
+		VERBOSE printf("configuration changed from env");
+	}
+	//save exported parallelization grid configured ... see Config.md in the top folder
+	//const ushort GRIDROWS = Conf.gridRows;
+	//const ushort GRIDCOLS = Conf.gridCols;
+	const ushort ORIG_GRID_CONF[2] = {Conf.gridRows,Conf.gridCols};
+
+	int maxThreads = omp_get_max_threads();
+	Conf.threadNum = (uint) maxThreads;
+	DEBUG   printf("omp_get_max_threads:\t%d\n",maxThreads); 
 
-    // TODO : change chunk distribution with a choice ?
-    cfg.chunkDistrbFunc = &chunksFair;
-    cfg.threadNum = 1;
+	/*
+	 * get exported schedule configuration, 
+	 * if schedule != static -> dyn like -> set a chunk division function before omp for
+	 */
+	int schedKind_chunk_monotonic[3];
+	ompGetRuntimeSchedule(schedKind_chunk_monotonic);
+	Conf.chunkDistrbFunc = &chunksNOOP; 
+	if (schedKind_chunk_monotonic[0] != omp_sched_static)
+		Conf.chunkDistrbFunc = chunkDistrbFunc;
+	VERBOSE 
+	  printf("%s",Conf.chunkDistrbFunc == &chunksNOOP?"static schedule =>chunkDistrbFunc NOOP\n":"");
+
+    int i, row_nz;
 
-    printf("irp %d %d %d %d %d\n", a_irp[0], a_irp[1], a_irp[2], a_irp[3], a_irp[4]);
     // setting up spmat type matrices
     a.M = a_m;
     a.N = a_n;
@@ -55,7 +81,13 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
     a.AS = a_as;
     a.JA = a_ja;
     a.IRP = a_irp;
-    a.MAX_ROW_NZ = a_max_row_nz;
+    a.MAX_ROW_NZ = a.IRP[1] - a.IRP[0];
+    for (i = 1; i < a.M; i ++){
+        row_nz = a.IRP[i+1] - a.IRP[i];
+        if (row_nz > a.MAX_ROW_NZ){
+            a.MAX_ROW_NZ = row_nz;
+        }
+    }
 
     b.M = b_m;
     b.N = b_n;
@@ -63,13 +95,20 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
     b.AS = b_as;
     b.JA = b_ja;
     b.IRP = b_irp;
-    b.MAX_ROW_NZ = b_max_row_nz;
+    b.MAX_ROW_NZ = b.IRP[1] - b.IRP[0];
+    for (i = 1; i < b.M; i ++){
+        row_nz = b.IRP[i+1] - b.IRP[i];
+        if (row_nz > b.MAX_ROW_NZ){
+            b.MAX_ROW_NZ = row_nz;
+        }
+    }
+
 
     // computing the size
     switch (impl_choice)
     {
     case ROW_BY_ROW_UB:
-        *nnz = spmmRowByRowCalculateSize_1(&a, &b, &cfg, accumul, rows_sizes, tmp_matrix);
+        *nnz = spmmRowByRowCalculateSize_1(&a, &b, &Conf, accumul, rows_sizes, tmp_matrix);
     default:
         break;
     }
@@ -79,11 +118,14 @@ void psb_f_spmm_merge_spacc(void **accumul,
                             void **rows_sizes,
                             void **tmp_matrix,
                             enum impl_types impl_choice,
-                            double **c_as,
-                            idx_t **c_ja,
-                            idx_t **c_irp,
+                            double **c_as_ptr,
+                            idx_t **c_ja_ptr,
+                            idx_t **c_irp_ptr,
                             int *info)
 {
+    double *c_as = *c_as_ptr;
+    idx_t *c_ja = *c_ja_ptr;
+    idx_t *c_irp = *c_irp_ptr;
     // merging the rows into the correct arrays
     switch (impl_choice)
     {
diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h
index 57676b59..6dd4dc0d 100644
--- a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h
+++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h
@@ -39,10 +39,10 @@
 //hold SPMM result over a unpartitionated space among threads-row[s' blocks]
 typedef struct{
 	//space to hold SPMM output
-	ulong*  JA;
+	idx_t*  JA;
 	double* AS;
-	ulong   size;			//num of entries allocated -> only dbg checks
-	ulong   lastAssigned;	//last JA&AS assigned index to an accumulator(atom)
+	idx_t   size;			//num of entries allocated -> only dbg checks
+	idx_t   lastAssigned;	//last JA&AS assigned index to an accumulator(atom)
 	SPACC*  accs;			//SPARSIFIED ACC POINTERS
 	uint	accsNum;	
 } SPMM_ACC; //accumulator for SPMM
diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h
index f1e19402..b4e471cc 100644
--- a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h
+++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h
@@ -61,7 +61,7 @@ SPMM CAT(spmmRowByRow_,OFF_F);
 
 idx_t CAT(spmmRowByRowCalculateSize_,OFF_F) (spmat* A, spmat*B, CONFIG* cfg, void** accumul, void **rows_sizes, void** tmp_matrix);
 
-void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double** AS, idx_t** JA, idx_t** IRP);
+void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double* AS, idx_t* JA, idx_t* IRP);
 
 /*
  * sparse parallel implementation of @A * @B parallelizing Gustavson 
diff --git a/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h b/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h
index 85ad4e82..779c3cb6 100644
--- a/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h
+++ b/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h
@@ -37,12 +37,12 @@
  * both of accumulator's dense array and nnzIdx in @aux and has to be big @vectLen
  */
 inline void CAT(scSparseVectMul_,OFF_F)
-  (double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux){
-	for (ulong i=0,j; i<vectLen; i++){
+  (double scalar,double* vectVals,idx_t* vectIdxs,idx_t vectLen, ACC_DENSE* aux){
+	for (idx_t i=0,j; i<vectLen; i++){
 		j = vectIdxs[i]-OFF_F;
 		DEBUGCHECKS{
 			if (j>=aux->vLen){
-				fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
+				fprintf(stderr,"index %d outside vLen %d\n",j,aux->vLen);
 				assert(j < aux->vLen);
 			}
 		}
@@ -65,12 +65,12 @@ inline void CAT(scSparseVectMul_,OFF_F)
  * both of accumulator's dense array and nnzIdx in @aux and has to be big @vectLen
  */
 inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
-  ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux){
-	for (ulong i=0,j; i<vectLen; i++){
+  idx_t* vectIdxs,idx_t vectLen,idx_t startIdx,ACC_DENSE* aux){
+	for (idx_t i=0,j; i<vectLen; i++){
 		j = vectIdxs[i]-OFF_F - startIdx;
 		DEBUGCHECKS{
 			if (j>=aux->vLen){
-				fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
+				fprintf(stderr,"index %d outside vLen %d\n",j,aux->vLen);
 				assert(j < aux->vLen);
 			}
 		}
@@ -90,9 +90,9 @@ inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
 #endif
 
 ///TODO IMPLEMENT SCALAR<->ROW MUL AS GENERIC SPARSE VECTOR<->SCALAR MUL
-//inline void scSparseRowMul(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
-inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
-	ulong  rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
+//inline void scSparseRowMul(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux){
+inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux){
+	idx_t  rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
 	#ifdef ROWLENS
 	rowLen = mat->RL[trgtR];
 	#else
@@ -116,7 +116,6 @@ inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
 		ERRPRINT("spMMSizeUpperbound: rowSizes calloc errd\n");
 		return NULL;
 	}
-	printf("A->IRP %ld %ld %ld %ld %ld\n", A->IRP[0], A->IRP[1], A->IRP[2], A->IRP[3], A->IRP[4]);
 	idx_t fullMatBound = 0;
 	#pragma omp parallel for schedule(static) reduction(+:fullMatBound)
 	for (idx_t r=0;  r<A->M; r++){
@@ -135,7 +134,7 @@ inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
 	rowSizes[A->M]  = fullMatBound;
 	AUDIT_INTERNAL_TIMES	End= omp_get_wtime();
 	VERBOSE 
-		printf("spMMSizeUpperbound:%lu\t%le s\n",rowSizes[A->M],End-Start);
+		printf("spMMSizeUpperbound:%d\t%le s\n",rowSizes[A->M],End-Start);
 	return rowSizes;
 }
 
@@ -170,7 +169,7 @@ inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
 	rowPartsSizes[ A->M*gridCols ]  = fullMatBound;
 	AUDIT_INTERNAL_TIMES	End= omp_get_wtime();
 	VERBOSE 
-	 printf("spMMSizeUpperboundColParts_:%lu\t%le s\n",rowPartsSizes[A->M],End-Start);
+	 printf("spMMSizeUpperboundColParts_:%d\t%le s\n",rowPartsSizes[A->M],End-Start);
 	return rowPartsSizes;
 }
 
@@ -182,7 +181,7 @@ inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
 #include "utils.h"
 ///Allocs - Free
 //SpMM holder of accumulats 
-inline SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum){
+inline SPMM_ACC* initSpMMAcc(idx_t entriesNum, idx_t accumulatorsNum){
 	SPMM_ACC* out = calloc(1,sizeof(*out));
 	if (!out){
 		ERRPRINT("initSpMMAcc:	out calloc errd\n");
@@ -275,7 +274,7 @@ static inline void _sparsifyUB(ACC_DENSE* accV,SPACC* accSparse,idx_t startColAc
 
 //row[Part] sparsified in a thread safe (exactly long) reserved area using atomics
 static inline void sparsifyUBNoPartsBounds
-  (SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc){
+  (SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, idx_t startColAcc){
 	//sort nnz indexes of dense accumulator
 	idx_t nnz = accV->nnzIdxMap.len;
 	idx_t sparsifyStartV;		 //start index(inside @accSparse) of @accV to sparsify
@@ -305,11 +304,11 @@ static inline void sparsifyUBNoPartsBounds
  */
 inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
   CONFIG* conf){
-	ulong nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
+	idx_t nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
 	//TODO PARALLEL MERGE - SAVE STARTING offset OF EACH PARTITION IN THE OUT MATRIX
-	ulong* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
+	idx_t* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
 	///count nnz entries and alloc arrays for them
-	for (ulong r=0; r<mat->M; r++){
+	for (idx_t r=0; r<mat->M; r++){
 		//for each partition ->get len -> outMat.IRP and aux offsets  
 		for (j=0,rLen=0; j<conf->gridCols; j++){
 			idx = IDX2D(r,j,conf->gridCols);
@@ -332,20 +331,20 @@ inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
 		return EXIT_FAILURE;
 	}
 	///popolate with rows nnz values and indexes
-	ulong pLen; //omp for aux vars
+	idx_t pLen; //omp for aux vars
 	#pragma omp parallel for schedule(static) private(pLen)
-	for (ulong i=0;  i<partsNum; i++){
+	for (idx_t i=0;  i<partsNum; i++){
 		pLen = rowsParts[i].len;
 		memcpy(mat->AS + rowsPartsOffsets[i],rowsParts[i].AS,pLen*sizeof(*(mat->AS)));
 		memcpy(mat->JA + rowsPartsOffsets[i],rowsParts[i].JA,pLen*sizeof(*(mat->JA)));
 	}
 	CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
-		for (ulong i=0,w=0; i<mat->M; i++){
+		for (idx_t i=0,w=0; i<mat->M; i++){
 			if (mat->IRP[i] != w) 
 				{ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
 			for (j=0; j<conf->gridCols; j++){
 				SPACC r = rowsParts[IDX2D(i,j,conf->gridCols)];
-				for (ulong jj=0; jj<r.len; jj++,w++){
+				for (idx_t jj=0; jj<r.len; jj++,w++){
 					if (mat->AS[w]!= r.AS[jj]){
 						ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
 					if (mat->JA[w]!= r.JA[jj]){
@@ -363,9 +362,9 @@ inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
  * allocd arrays to hold non zero values and indexes into @mat
  */
 inline int mergeRows(SPACC* rows,spmat* mat){
-	ulong nzNum=0;
+	idx_t nzNum=0;
 	//count nnz entries and alloc arrays for them
-	for (ulong r=0;   r<mat->M;   ++r){
+	for (idx_t r=0;   r<mat->M;   ++r){
 		nzNum += rows[r].len;
 		mat->IRP[r+1] = nzNum;
 		#ifdef ROWLENS
@@ -384,15 +383,15 @@ inline int mergeRows(SPACC* rows,spmat* mat){
 	///POPOLATE WITH ROWS NNZ VALUES AND INDEXES
 	//TODO PARALLEL COPY
 	#pragma omp parallel for schedule(static)
-	for (ulong r=0; r<mat->M; r++){
+	for (idx_t r=0; r<mat->M; r++){
 		memcpy(mat->AS+mat->IRP[r], rows[r].AS, rows[r].len*sizeof(*(mat->AS)));
 		memcpy(mat->JA+mat->IRP[r], rows[r].JA, rows[r].len*sizeof(*(mat->JA)));
 	}
 	CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
-		for (ulong r=0,i=0; r<mat->M; r++){
+		for (idx_t r=0,i=0; r<mat->M; r++){
 			if (i != mat->IRP[r])
 				{ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
-			for (ulong j=0; j<rows[r].len; j++,i++){
+			for (idx_t j=0; j<rows[r].len; j++,i++){
 				if (mat->AS[i]!= rows[r].AS[j]){
 					ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
 				if (mat->JA[i]   != rows[r].JA[j]){
@@ -415,9 +414,9 @@ inline int mergeRows(SPACC* rows,spmat* mat){
 */
 
 inline idx_t calculateSize(SPACC* rows, spmat *mat){
-	ulong nzNum=0;
+	idx_t nzNum=0;
 	//count nnz entries and alloc arrays for them
-	for (ulong r=0;   r<mat->M;   ++r){
+	for (idx_t r=0;   r<mat->M;   ++r){
 		nzNum += rows[r].len;
 		mat->IRP[r+1] = nzNum;
 		#ifdef ROWLENS
@@ -429,16 +428,17 @@ inline idx_t calculateSize(SPACC* rows, spmat *mat){
 	return nzNum;
 }
 
-inline int mergeRowsPopulate(SPACC* rows, spmat* mat, double** AS, idx_t** JA, idx_t** IRP) {
+inline int mergeRowsPopulate(SPACC* rows, spmat* mat, double* AS, idx_t* JA, idx_t* IRP) {
     // Populate with rows' non-zero values and indexes
     // TODO PARALLEL COPY
-    #pragma omp parallel for schedule(static)
-    for (ulong r = 0; r < mat->M; r++) {
-        memcpy(*AS + mat->IRP[r], rows[r].AS, rows[r].len * sizeof(double));
-        memcpy(*JA + mat->IRP[r], rows[r].JA, rows[r].len * sizeof(idx_t));
+    // #pragma omp parallel for schedule(static)
+    for (idx_t r = 0; r < mat->M; r++) {
+		fprintf(stderr, "r %d, AS %p, mat->IRP[r] %d, rows[r].AS %p, rows[r].len %d\n", r,AS,mat->IRP[r],rows[r].AS, rows[r].len);
+        memcpy(AS + mat->IRP[r], rows[r].AS, rows[r].len * sizeof(double));
+        memcpy(JA + mat->IRP[r], rows[r].JA, rows[r].len * sizeof(idx_t));
     }
     
-	memcpy(*IRP, mat->IRP, (mat->M + 1) * sizeof(idx_t));
+	memcpy(IRP, mat->IRP, (mat->M + 1) * sizeof(idx_t));
 	
     // Consistency checks
     // TODO: Add your consistency checks here
diff --git a/base/serial/impl/sp3mm4amg/include/config.h b/base/serial/impl/sp3mm4amg/include/config.h
index c504dfcc..d8d516ac 100644
--- a/base/serial/impl/sp3mm4amg/include/config.h
+++ b/base/serial/impl/sp3mm4amg/include/config.h
@@ -40,11 +40,11 @@ typedef enum{
 
 //
 typedef struct{
-	ushort gridRows;
-	ushort gridCols;
+	int gridRows;
+	int gridCols;
 	//TODO FULL CONFIG DOCCED HERE
 	ROW_MMSYM_IMPL_MODE symbMMRowImplID; //how to compute symb mul (if required)
-	uint threadNum;  //thread num to use in an OMP parallel region ...
+	int threadNum;  //thread num to use in an OMP parallel region ...
 	void* chunkDistrbFunc;  //CHUNKS_DISTR_INTERF func pntr
 } CONFIG;  
 ///Smart controls
diff --git a/base/serial/impl/sp3mm4amg/include/inlineExports.h b/base/serial/impl/sp3mm4amg/include/inlineExports.h
index cc3e0a83..820af188 100644
--- a/base/serial/impl/sp3mm4amg/include/inlineExports.h
+++ b/base/serial/impl/sp3mm4amg/include/inlineExports.h
@@ -33,17 +33,17 @@ void cleanRbNodes(rbRoot* root,rbNode* nodes,idx_t nodesNum);
 int rbInsertNewKey(rbRoot *root,rbNode *node, idx_t key);
 //void C_FortranShiftIdxs(spmat* outMat);
 //void Fortran_C_ShiftIdxs(spmat* m);
-ACC_DENSE* _initAccVectors(ulong num,ulong size);
-ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size); //TODO PERF WITH NEXT
-SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum);
+ACC_DENSE* _initAccVectors(idx_t num,idx_t size);
+ACC_DENSE* _initAccVectors_monoalloc(idx_t num,idx_t size); //TODO PERF WITH NEXT
+SPMM_ACC* initSpMMAcc(idx_t entriesNum, idx_t accumulatorsNum);
 idx_t reductionMaxSeq(idx_t* arr,idx_t arrLen);
-int _allocAccDense(ACC_DENSE* v,ulong size);
+int _allocAccDense(ACC_DENSE* v,idx_t size);
 int mergeRows(SPACC* rows,spmat* mat);
 int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,CONFIG* conf);
-void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num); 
+void _freeAccsDenseChecks(ACC_DENSE* vectors,idx_t num); 
 void _resetAccVect(ACC_DENSE* acc);
 void _resetIdxMap(SPVECT_IDX_DENSE_MAP* acc);
 void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
-void freeAccsDense(ACC_DENSE* vectors,ulong num);
+void freeAccsDense(ACC_DENSE* vectors,idx_t num);
 void freeSpMMAcc(SPMM_ACC* acc);
-void sparsifyDenseVect(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc);
+void sparsifyDenseVect(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, idx_t startColAcc);
diff --git a/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h b/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h
index eba852f2..08255ba0 100644
--- a/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h
+++ b/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h
@@ -35,9 +35,9 @@
 
 ////inline exports
 //multi implmentation functions
-void CAT(scSparseVectMul_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux);
-void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux);
-void CAT(_scRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
-void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
+void CAT(scSparseVectMul_,OFF_F)(double scalar,double* vectVals,idx_t* vectIdxs,idx_t vectLen, ACC_DENSE* aux);
+void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,idx_t* vectIdxs,idx_t vectLen,idx_t startIdx,ACC_DENSE* aux);
+void CAT(_scRowMul_,OFF_F)(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux);
+void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux);
 idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B);
 idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)(spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets);
diff --git a/base/serial/impl/sp3mm4amg/include/macros.h b/base/serial/impl/sp3mm4amg/include/macros.h
index c75017ee..539430e8 100644
--- a/base/serial/impl/sp3mm4amg/include/macros.h
+++ b/base/serial/impl/sp3mm4amg/include/macros.h
@@ -42,7 +42,7 @@
 #define IN_RANGE(i,start,end)		( (start) <= (i) && (i) <= (end) )
 
 //ceil(x/y) with integers
-#define INT_DIV_CEIL(x,y)		    ( ( (x) - 1) / (y) + 1 )
+#define INT_DIV_CEIL(x,y)	    ( ( (x) - 1) / (y) + 1 )
 //2D ROW MAJOR indexing wrap compute
 #define IDX2D(i,j,nCols)            ((j) + (i)*(nCols))
 
diff --git a/base/serial/impl/sp3mm4amg/include/ompChunksDivide.h b/base/serial/impl/sp3mm4amg/include/ompChunksDivide.h
index 6d0ba45c..a97cefca 100644
--- a/base/serial/impl/sp3mm4amg/include/ompChunksDivide.h
+++ b/base/serial/impl/sp3mm4amg/include/ompChunksDivide.h
@@ -40,14 +40,14 @@
 #include "config.h"
 #include "omp.h"
 //distribution of @rows|blocks of @matrix, exploiting @config
-typedef void (CHUNKS_DISTR )           (ulong,spmat*,CONFIG*);
-typedef void (*CHUNKS_DISTR_INTERF )   (ulong,spmat*,CONFIG*);
+typedef void (CHUNKS_DISTR )           (idx_t,spmat*,CONFIG*);
+typedef void (*CHUNKS_DISTR_INTERF )   (idx_t,spmat*,CONFIG*);
 
 //NOOP chunks division for manual configuration via export OMP_SCHEDULE  
-inline void chunksNOOP(ulong r,spmat* mat,CONFIG* cfg){ return; }
+inline void chunksNOOP(idx_t r,spmat* mat,CONFIG* cfg){ return; }
 #include "ompGetICV.h"
 //fair division of @r elements from matrix @mat with threads num in @cfg
-inline void chunksFair(ulong r,spmat* mat,CONFIG* cfg){
+inline void chunksFair(idx_t r,spmat* mat,CONFIG* cfg){
 	assert(cfg->threadNum > 0); //configured target thread num
 	omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
 	omp_get_schedule(&kind,&chunk_size);
@@ -75,7 +75,7 @@ inline void chunksFair(ulong r,spmat* mat,CONFIG* cfg){
 }
 //fair division of @r elements from matrix @mat of threads in @cfg
 //subdividing the fair share with a factor of @FAIR_CHUNKS_FOLDING
-inline void chunksFairFolded(ulong r,spmat* mat,CONFIG* cfg){
+inline void chunksFairFolded(idx_t r,spmat* mat,CONFIG* cfg){
 	assert(cfg->threadNum > 0); //configured target thread num
 	omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
 	omp_get_schedule(&kind,&chunk_size);
diff --git a/base/serial/impl/sp3mm4amg/include/parser.h b/base/serial/impl/sp3mm4amg/include/parser.h
index c482f7e1..2d5f25ec 100644
--- a/base/serial/impl/sp3mm4amg/include/parser.h
+++ b/base/serial/impl/sp3mm4amg/include/parser.h
@@ -35,16 +35,16 @@
 #include "sparseMatrix.h" 
 
 typedef struct{
-	ulong row;
-	ulong col;
+	idx_t row;
+	idx_t col;
 	double val;
 } entry;	 //MatrixMarket COO entry
 
 typedef struct{
 	MM_typecode mcode;
 	entry* entries;
-	ulong* rowLens;
-	ulong M,N,NZ;   //spmat sizes
+	idx_t* rowLens;
+	idx_t M,N,NZ;   //spmat sizes
 } MatrixMarket;
 
 ////COO PARSE
@@ -61,7 +61,7 @@ int MMCheck(MM_typecode typecode);
  * return allocated and filled COO entries with the NNZ number into
  * NO SORT CHECKING HERE
  */
-entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens);
+entry* MMtoCOO(idx_t* NZ, FILE *fp, MM_typecode mcode,idx_t* rowLens);
 
 ////COO -> ANYTHING ELSE CONVERSION
 /*
@@ -69,13 +69,13 @@ entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens);
  * EXPECTED: CSR arrays allocated, @entries col sorted in (not madatory consecut) rows
  * [simmetrical parts explicitly rappresented --> not important here]
  */
-int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens);
+int COOtoCSR(entry* entries, spmat* mat,idx_t* rowLens);
 /*
  * write COO entries in @entries inside sparse matrix @mat in ELL format
  * EXPECTED: @entries col sorted in (not madatory consecut) rows
  * ELL internal array allocated in this function, not freed in case of error
  */
-int COOtoELL(entry* entries, spmat* mat, ulong* rowLens);
+int COOtoELL(entry* entries, spmat* mat, idx_t* rowLens);
 ////wrapper MM -> specialized target
 /*
  * Parse MatrixMarket matrix stored in file at @matPath
diff --git a/base/serial/impl/sp3mm4amg/include/sparseMatrix.h b/base/serial/impl/sp3mm4amg/include/sparseMatrix.h
index 8fc769e8..e146024a 100644
--- a/base/serial/impl/sp3mm4amg/include/sparseMatrix.h
+++ b/base/serial/impl/sp3mm4amg/include/sparseMatrix.h
@@ -75,7 +75,7 @@ typedef struct{
 	idx_t    vLen;       			//size of the aux dense vector //TODO USELESS?
 	SPVECT_IDX_DENSE_MAP nnzIdxMap;	//
 } ACC_DENSE;
-int allocAccDense(ACC_DENSE* v, ulong size);
+int allocAccDense(ACC_DENSE* v, idx_t size);
 
 /*  
  **** NNZ IDXS PRESENCE FLAGS ACCESS INTERFACE:  ***
diff --git a/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h b/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h
index fbcab700..6d56c4d8 100644
--- a/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h
+++ b/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h
@@ -40,19 +40,19 @@
  * returning an offsets matrix out[i][j] = start of jth colPartition of row i
  * subdivide @A columns in uniform cols ranges in the output 
  */
-idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
+idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols);
 
 /*
  * partition CSR sparse matrix @A in @gridCols columns partitions as 
  * indipended and allocated sparse matrixes and return them
  * subdivide @A columns in uniform cols ranges in the output 
  */
-spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
+spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols);
 //same as above but with (aux) use of offsets partitoning (also returned if colOffsets!=NULL
-spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,idx_t** colPartsOffsets);
+spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,int gridCols,idx_t** colPartsOffsets);
 
 //same as checkOverallocPercent but with 2D partitioning - CSR col partitioning
-void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
+void CAT(checkOverallocRowPartsPercent_,OFF_F)(idx_t* forecastedSizes,spmat* AB,
 				  	      idx_t gridCols,idx_t* bColOffsets);
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -62,13 +62,13 @@ void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
 
 //shift every index about the sparse data for use the matrix in a fortran app
 inline void C_FortranShiftIdxs(spmat* m){
-	for(ulong r=0; r<m->M+1; m -> IRP[r]++,r++);
-	for(ulong i=0; i<m->NZ;  m -> JA[i]++, i++);
+	for(idx_t r=0; r<m->M+1; m -> IRP[r]++,r++);
+	for(idx_t i=0; i<m->NZ;  m -> JA[i]++, i++);
 }
 //shift every index about the sparse data for use the matric in a C app
 inline void Fortran_C_ShiftIdxs(spmat* m){	//TODO DBG ONLY and compleatness
-	for(ulong r=0; r<m->M+1; m -> IRP[r]--,r++);
-	for(ulong i=0; i<m->NZ;  m -> JA[i]--, i++);
+	for(idx_t r=0; r<m->M+1; m -> IRP[r]--,r++);
+	for(idx_t i=0; i<m->NZ;  m -> JA[i]--, i++);
 }
 
 /*
@@ -77,7 +77,7 @@ inline void Fortran_C_ShiftIdxs(spmat* m){	//TODO DBG ONLY and compleatness
  * in @forecastedSizes there's for each row -> forecasted size 
  * and in the last entry the cumulative of the whole matrix
  */
-void checkOverallocPercent(ulong* forecastedSizes,spmat* AB);
+void checkOverallocPercent(idx_t* forecastedSizes,spmat* AB);
 /*  
 	check if sparse matrixes A<->B differ up to 
 	DOUBLE_DIFF_THREASH per element
@@ -86,7 +86,7 @@ int spmatDiff(spmat* A, spmat* B);
 ////dyn alloc of spMM output matrix
 /*
 ///size prediction of AB = @A * @B
-inline ulong SpMMPreAlloc(spmat* A,spmat* B){
+inline idx_t SpMMPreAlloc(spmat* A,spmat* B){
 	//TODO BETTER PREALLOC HEURISTICS HERE 
 	return MAX(A->NZ,B->NZ);
 }
@@ -110,7 +110,7 @@ inline spmat* initSpMatrixSpMM(spmat* A, spmat* B){
 
 #define REALLOC_FACTOR  1.5
 //realloc sparse matrix NZ arrays
-inline int reallocSpMatrix(spmat* mat,ulong newSize){
+inline int reallocSpMatrix(spmat* mat,idx_t newSize){
 	mat->NZ *= newSize;
 	void* tmp;
 	if (!(tmp = realloc(mat->AS,mat->NZ * sizeof(*(mat->AS))))){
@@ -133,6 +133,6 @@ void printSparseMatrix(spmat* sparseMat,char justNZMarkers);
 /*convert @sparseMat sparse matrix in dense matrix returned*/
 double* CSRToDense(spmat* sparseMat);
 
-void freeAccsDense(ACC_DENSE* vectors,ulong num);
+void freeAccsDense(ACC_DENSE* vectors,idx_t num);
 
 #endif //SPARSEUTILS_H_COMMON_IDX_IMPLS 
diff --git a/base/serial/impl/sp3mm4amg/include/utils.h b/base/serial/impl/sp3mm4amg/include/utils.h
index 8ef47492..eb62dec6 100644
--- a/base/serial/impl/sp3mm4amg/include/utils.h
+++ b/base/serial/impl/sp3mm4amg/include/utils.h
@@ -58,18 +58,18 @@ int createNewFile(char* const outFpath);
 #define DOUBLE_STR_FORMAT	"%25le\n"
 //write double vector @v as row sequence of double at @fpath
 //e.g. read with od -tf8 -w8 fpath : OCTALOFFSET:   DOUBLE FULL DIGITS
-int writeDoubleVector(char* fpath,double* v,ulong size);
+int writeDoubleVector(char* fpath,double* v,idx_t size);
 /*
  * read vector of double [Str] of arbitrary size from @fpath, true lenght in *size
  * if size point to a nnz value, the initial allocation will be of *size
  * eventual successive reallocation done multipling *size with VECTOR_STEP_REALLOC
  */
-double* readDoubleVector(char* fpath,ulong* size);
-double* readDoubleVectorStr(char* fpath,ulong* size);
+double* readDoubleVector(char* fpath,idx_t* size);
+double* readDoubleVectorStr(char* fpath,idx_t* size);
 
 ///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
 //dual of readDoubleVectorVector
-int writeDoubleVectorAsStr(char* fpath,double* v,ulong size);
+int writeDoubleVectorAsStr(char* fpath,double* v,idx_t size);
 int MPI_Dims_create(int nnodes, int ndims, int dims[]);	//commons/ompi_dims_create/ompi_dims_create.c
 
 #include "config.h"
@@ -81,16 +81,16 @@ int getConfig(CONFIG* conf);
 
 //append only list implemented with a reallocated array
 typedef struct{
-	ulong* a;
-	ulong  size;
-	ulong  lastIdx;
+	idx_t* a;
+	idx_t  size;
+	idx_t  lastIdx;
 } APPENDARRAY;
 //append @val to @list, reallocating if reached end
-//TODO inline int appendArr(ulong val,APPENDARRAY* list);
+//TODO inline int appendArr(idx_t val,APPENDARRAY* list);
 
 void sortuint(uint* arr, uint len);     //sort uint array @arr of @len elements
 void sort_idx_t(idx_t* arr, idx_t len); 
-void sortulong(ulong* arr, ulong len);
+void sortidx_t(idx_t* arr, idx_t len);
 void sortRbNode(rbNode* arr,idx_t len);
 
 ///ranges functions 
@@ -157,9 +157,9 @@ inline idx_t reductionMaxOmp(idx_t* arr,idx_t arrLen){
  *  of the 2 entries of the input vectors, signed as a[i] - b[i] (as well as dump prints)
  * CONVENTION:	@a = true result, @b vector to check with the true result 
  */
-int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax);
+int doubleVectorsDiff(double* a, double* b, idx_t n,double* diffMax);
 //fill a random vector in @v long @size doubles
-int fillRndVector(ulong size, double* v);
+int fillRndVector(idx_t size, double* v);
 //read vector as a sequence of space separated double from file at @fpath 
 #define VECTOR_STEP_MALLOC 100
 
@@ -174,8 +174,8 @@ int fillRndVector(ulong size, double* v);
 int extractInTmpFS(char* path, char* tmpFsDecompressPath);
 //compute E[@values] in @out[0] and VAR[@values] in @out[1] of @numVals values
 void statsAvgVar(double* values,uint numVals, double* out);
-void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers);
-void printVector(double* v,ulong size);
+void printMatrix(double* mat,idx_t m,idx_t n,char justNZMarkers);
+void printVector(double* v,idx_t size);
 
 void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
 #endif
diff --git a/base/serial/impl/sp3mm4amg/lib/parser.c b/base/serial/impl/sp3mm4amg/lib/parser.c
index f527500f..ed59ac75 100644
--- a/base/serial/impl/sp3mm4amg/lib/parser.c
+++ b/base/serial/impl/sp3mm4amg/lib/parser.c
@@ -52,10 +52,10 @@ int MMCheck(MM_typecode mcode) {
 	return EXIT_SUCCESS;
 }
 
-entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens){
+entry* MMtoCOO(idx_t* NZ, FILE *fp, MM_typecode mcode,idx_t* rowLens){
 	int scanndRet=0;
-	ulong nzTrgt=*NZ,nzIdx=0;	//expanded num of nz (in case of sym matrix)
-	ulong diagEntries=0, row = 0, col = 0;//current entry's row,col from MM -> 1 based
+	idx_t nzTrgt=*NZ,nzIdx=0;	//expanded num of nz (in case of sym matrix)
+	idx_t diagEntries=0, row = 0, col = 0;//current entry's row,col from MM -> 1 based
 	double val = 0;
 	entry* entries = NULL;		//COO parsed entries
 	///init
@@ -70,10 +70,10 @@ entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens){
 	///parse MM fp lines into COOordinate entries
 	while (1) { // Reading the fp until EOF
 		if (mm_is_pattern(mcode)){
-			scanndRet = fscanf(fp, "%lu %lu\n", &row, &col);
+			scanndRet = fscanf(fp, "%d %d\n", &row, &col);
 			val = 1.0;
 		} else if (mm_is_real(mcode) || (mm_is_integer(mcode))){
-			scanndRet = fscanf(fp, "%lu %lu %lf\n", &row, &col, &val);
+			scanndRet = fscanf(fp, "%d %d %lf\n", &row, &col, &val);
 		}
 		
 
@@ -168,11 +168,11 @@ MatrixMarket* MMRead(char* matPath){
 
 
 ////COO -> ANYTHING ELSE CONVERSION
-int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){ 
+int COOtoCSR(entry* entries, spmat* mat,idx_t* rowLens){ 
 	int out = EXIT_FAILURE;
-	ulong idx;
+	idx_t idx;
 	long* _rowsLastCol = NULL;	//for each row -> last added entry's columnIdx 
-	ulong* rowsNextIdx = NULL;	//for each row -> next entry progressive idx
+	idx_t* rowsNextIdx = NULL;	//for each row -> next entry progressive idx
 	if (!(rowsNextIdx = calloc(mat->M,sizeof(*rowsNextIdx)))){
 		ERRPRINT("MMtoCOO:  rowsNextIdx calloc errd\n");
 		goto _end;
@@ -186,9 +186,9 @@ int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
 	}
 	/*TODO OLD
 	 * //get rowLens->IRP (partial), TODO moved MMtoCOO to avoid FULL rescan entries
-	 * for (ulong i=0; i<mat->NZ; i++)	 mat->IRP[entries[i].row+1]++;
+	 * for (idx_t i=0; i<mat->NZ; i++)	 mat->IRP[entries[i].row+1]++;
 	 * memcpy(mat->RL,mat->IRP + 1,sizeof(*mat->IRP) * mat->M); //TODO in next ifdef
-	 * for (ulong i=2; i<mat->M+1; i++)	mat->IRP[i] += mat->IRP[i-1];
+	 * for (idx_t i=2; i<mat->M+1; i++)	mat->IRP[i] += mat->IRP[i-1];
 	 * OLD2: rowLens memcpy ... no just moved the pointer
 	 * #ifdef ROWLENS
 	 * memcpy(mat->RL,rowLens,sizeof(*rowLens) * mat->M); //TODO in next ifdef
@@ -197,19 +197,19 @@ int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
 	//IRP: trasform rows lens as increments to build row index "pointer"
 	//0th -> 0 mandatory; 1th = 0th row len, ...., M+1th = end of Mth row
 	memcpy(mat->IRP+1,rowLens,sizeof(*rowLens) * mat->M);//init IRP with rows lens
-	for (ulong i=2; i<mat->M+1; i++)	mat->IRP[i] += mat->IRP[i-1];
+	for (idx_t i=2; i<mat->M+1; i++)	mat->IRP[i] += mat->IRP[i-1];
 	CONSISTENCY_CHECKS  assert(mat->IRP[mat->M] == mat->NZ);
 	///FILL 
 	//TODO EXPECTED entries with .col entries -> CONSISTENCY_CHECKS
 	//sorted for each row (even nn sequential in @entries)
 	//entries write in CSR format
 	entry* e;
-	for (ulong i=0; i<mat->NZ; i++) {
+	for (idx_t i=0; i<mat->NZ; i++) {
 		e = entries+i;
 		CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE SORTED
 			/*#pragma message("COO sorting check enabled")*/
 			if (_rowsLastCol[e->row] >= (long) e->col){
-				ERRPRINTS("not sorted entry:%ld,%ld,%lf",e->row,e->col,e->val);
+				ERRPRINTS("not sorted entry:%d,%d,%lf",e->row,e->col,e->val);
 				goto _end;
 			}
 			_rowsLastCol[e->row] = e->col;
@@ -228,16 +228,16 @@ int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
 	return out;
 }
 
-int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
+int COOtoELL(entry* entries, spmat* mat, idx_t* rowLens){
 	int out=EXIT_FAILURE;
-	ulong maxRow = 0, col, _ellEntriesTot, *rowsNextCol;
+	idx_t maxRow = 0, col, _ellEntriesTot, *rowsNextCol;
 	long* _rowsLastCol=NULL;
 	entry* e;
-	for (ulong i=0; i<mat->M; i++)  maxRow = MAX(maxRow,rowLens[i]); 
+	for (idx_t i=0; i<mat->M; i++)  maxRow = MAX(maxRow,rowLens[i]); 
 	_ellEntriesTot = 2*mat->M*maxRow;
 	#ifdef LIMIT_ELL_SIZE
 	if ( _ellEntriesTot > ELL_MAX_ENTRIES ){
-		ERRPRINTS("Required entries %lu -> %lu uMB for the matrix exceed the "
+		ERRPRINTS("Required entries %d -> %lu uMB for the matrix exceed the "
 		  "designated threashold of: %lu  -> %lu MB for ellpack\n",
 		  _ellEntriesTot,(sizeof(double)*_ellEntriesTot) >> 20,
 		  ELL_MAX_ENTRIES,(sizeof(double)*ELL_MAX_ENTRIES) >> 20);
@@ -274,12 +274,12 @@ int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
 	///FILL NZ
 	//TODO EXPECTED entries with .col entries -> CONSISTENCY_CHECKS
 	//sorted for each row (even nn sequential in @entries)
-	for (ulong i=0; i<mat->NZ; i++){
+	for (idx_t i=0; i<mat->NZ; i++){
 		e = entries + i;
 		CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE COLS SORTED for righe successive
 			/*#pragma message("COO sorting check enabled")*/
 			if (_rowsLastCol[e->row] >= (long) e->col){
-				ERRPRINTS("not sorted entry:%ld,%ld,%lf",
+				ERRPRINTS("not sorted entry:%d,%d,%lf",
 					  e->row,e->col,e->val);
 				goto _end;
 			}
@@ -291,16 +291,16 @@ int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
 		 
 	}
 	///FILL PAD
-	ulong padded = 0,paddedEntries = mat->M*mat->MAX_ROW_NZ;
-	for (ulong r=0; r<mat->M; r++){
-		for (ulong c=rowLens[r],j=IDX2D(r,c,maxRow); c<maxRow; c++,j++,padded++){
+	idx_t padded = 0,paddedEntries = mat->M*mat->MAX_ROW_NZ;
+	for (idx_t r=0; r<mat->M; r++){
+		for (idx_t c=rowLens[r],j=IDX2D(r,c,maxRow); c<maxRow; c++,j++,padded++){
 			//mat->AS[j] = ELL_AS_FILLER; //TODO ALREADY DONE IN CALLOC
 			//mat->JA[j] = mat->JA[rowLens[r]-1]; //ELL_JA_FILLER; //TODO calloc CUDA benefit?
 		}
 	}
 	VERBOSE{ 
-	  printf("padded %lu entries = %lf%% of NZ\n",padded,100*padded/(double) mat->NZ);
-	  printf("ELL matrix of: %lu paddedEntries -> %lu MB of JA+AS\n",
+	  printf("padded %d entries = %lf%% of NZ\n",padded,100*padded/(double) mat->NZ);
+	  printf("ELL matrix of: %d paddedEntries -> %ld MB of JA+AS\n",
 		paddedEntries,(paddedEntries*sizeof(*(mat->JA))+paddedEntries*sizeof(*(mat->AS))) >> 20);
 	}
 	out = EXIT_SUCCESS;
@@ -318,7 +318,7 @@ spmat* MMtoCSR(char* matPath){
 		return NULL;
 	}
 	if (!(mat = calloc(1,sizeof(*mat)))){
-		ERRPRINT("MMtoCSR: mat struct alloc errd");
+		ERRPRINT("MMtoCSR: mat struct alloc err\n");
 		goto err;
 	}
 	mat -> M = mm->M;
@@ -345,7 +345,7 @@ spmat* MMtoCSR(char* matPath){
 	#endif
 
 	VERBOSE
-	printf("MMtoCSR: %lu NZ entries-> %lu MB of AS+JA+IRP\n",mat->NZ,
+	printf("MMtoCSR: %d NZ entries-> %ld MB of AS+JA+IRP\n",mat->NZ,
 	(mat->NZ*sizeof(*(mat->AS))+mat->NZ*sizeof(*(mat->JA))+(1+mat->M*sizeof(*(mat->IRP))))>>20);
 	goto _free;
 
diff --git a/base/serial/impl/sp3mm_impl.f90 b/base/serial/impl/sp3mm_impl.f90
index 972b4530..995a44c9 100644
--- a/base/serial/impl/sp3mm_impl.f90
+++ b/base/serial/impl/sp3mm_impl.f90
@@ -15,24 +15,20 @@ subroutine dspmm(a,b,c,info, impl_choice)
     integer(psb_ipk_), intent(in)       :: impl_choice
 
     ! Internal variables
-    integer(c_size_t)   :: a_m,a_n,a_nz
-    type(c_ptr)         :: a_as,a_ja,a_irp
-    integer(c_size_t)   :: a_max_row_nz
-    integer(c_size_t)   :: b_m,b_n,b_nz
-    type(c_ptr)         :: b_as,b_ja,b_irp
-    integer(c_size_t)   :: b_max_row_nz
-    integer(c_int)      :: impl_choice_
-    type(c_ptr)         :: accumul, rows_sizes, tmp_matrix
-    integer(c_size_t)   :: nnz
-    type(c_ptr)         :: c_as,c_ja,c_irp
+    integer(c_int)  :: a_m,a_n,a_nz
+    type(c_ptr)     :: a_as,a_ja,a_irp
+    integer(c_int)  :: b_m,b_n,b_nz
+    type(c_ptr)     :: b_as,b_ja,b_irp
+    integer(c_int)  :: impl_choice_
+    type(c_ptr)     :: accumul, rows_sizes, tmp_matrix
+    integer(c_int)  :: nnz
+    type(c_ptr)     :: c_as,c_ja,c_irp
 
     interface spmm_build_spacc
         subroutine psb_f_spmm_build_spacc(c_a_m,c_a_n,c_a_nz,&
                             c_a_as,c_a_ja,c_a_irp,&
-                            c_a_max_row_nz,&
                             c_b_m,c_b_n,c_b_nz,&
                             c_b_as,c_b_ja,c_b_irp,&
-                            c_b_max_row_nz,&
                             c_impl_choice,&
                             c_accumul,&
                             c_rows_sizes,&
@@ -41,16 +37,14 @@ subroutine dspmm(a,b,c,info, impl_choice)
                             c_nnz) bind(C)
             use iso_c_binding
             use psb_base_mod
-            integer(c_size_t), intent(in), value :: c_a_m,c_a_n,c_a_nz
+            integer(c_int), intent(in), value :: c_a_m,c_a_n,c_a_nz
             type(c_ptr), intent(in) :: c_a_as,c_a_ja,c_a_irp
-            integer(c_size_t), intent(in), value :: c_a_max_row_nz
-            integer(c_size_t), intent(in), value :: c_b_m,c_b_n,c_b_nz
+            integer(c_int), intent(in), value :: c_b_m,c_b_n,c_b_nz
             type(c_ptr), intent(in) :: c_b_as,c_b_ja,c_b_irp
-            integer(c_size_t), intent(in), value :: c_b_max_row_nz
             integer(c_int), intent(in), value :: c_impl_choice
             type(c_ptr), intent(out) :: c_accumul,c_rows_sizes,c_tmp_matrix
             integer(psb_ipk_), intent(out) :: c_info
-            integer(c_size_t), intent(out) :: c_nnz
+            integer(c_int), intent(out) :: c_nnz
         end subroutine psb_f_spmm_build_spacc
     end interface spmm_build_spacc
 
@@ -74,7 +68,6 @@ subroutine dspmm(a,b,c,info, impl_choice)
     a_m = a%get_nrows()
     a_n = a%get_ncols()
     a_nz = a%get_nzeros()
-    write(*,*) 'IRP(1:5) ',a%irp(1:5)
     a_as = c_loc(a%val)
     a_ja = c_loc(a%ja)
     a_irp = c_loc(a%irp)
@@ -89,24 +82,25 @@ subroutine dspmm(a,b,c,info, impl_choice)
 
     ! call calculateSize
     call psb_f_spmm_build_spacc(a_m,a_n,a_nz,a_as,&
-                                a_ja,a_irp,a_max_row_nz,&
+                                a_ja,a_irp,&
                                 b_m,b_n,b_nz,b_as,b_ja,&
-                                b_irp,b_max_row_nz,&
+                                b_irp,&
                                 impl_choice_,accumul,&
                                 rows_sizes,tmp_matrix,&
                                 info,nnz)
 
     ! allocate c%val, c%ja and c%irp
-    allocate(c%val(nnz))
-    allocate(c%ja(nnz))
-    allocate(c%irp(a_m + 1))
-    
-    ! c_as = c_loc(c%val)
-    ! c_ja = c_loc(c%ja)
-    ! c_irp = c_loc(c%irp)
+    call psb_realloc(nnz,c%val,info)
+    if (info /= 0) write(*,*) "reallocation failed"
+    call psb_realloc(nnz,c%ja,info)
+    call psb_realloc(a_m+1,c%irp,info)
 
-    ! c%set_nrows(a_m)
-    ! c%set_ncols(b_n)
+    c_as = c_loc(c%val)
+    c_ja = c_loc(c%ja)
+    c_irp = c_loc(c%irp)
+
+    call c%set_nrows(a_m)
+    call c%set_ncols(b_n)
 
     ! call spmmRowByRowPopulate
     call psb_f_spmm_merge_spacc(accumul,&