changed all unsigned longs to int, modified makefile for debug purposes, reallocate c matrix arrays in sp3mm_impl.f90

sp3mm-interface
wlthr 2 years ago
parent 5e252b77bb
commit 4929991249

@ -10,7 +10,7 @@ CWALL+=-Wno-unused-label -Wfatal-errors
CINCL = -Iinclude/
CFLAGS = -g -O3 $(CWALL) $(CINCL) -fopenmp $(RUNTIME)
MACROS = -DDEBUGPRINT="if(FALSE)" -DDEBUG="if(FALSE)" -DCONSISTENCY_CHECKS="if(FALSE)" -DVERBOSE="if(FALSE)" -DDEBUGCHECKS="if(FALSE)"
MACROSDBG = -DCONSISTENCY_CHECKS="if(TRUE)" -DDEBUGCHECKS="if(TRUE)" -DVERBOSE="if(TRUE)" -DDEBUG="if(TRUE)"
MACROSDBG = -DCONSISTENCY_CHECKS="if(TRUE)" -DDEBUGCHECKS="if(TRUE)" -DVERBOSE="if(TRUE)" -DDEBUG="if(TRUE)" -DDEBUGPRINT="if(TRUE)"
LDFLAGS = -lm
LIBDIR=../../../

@ -122,7 +122,7 @@ spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
} */
for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
for (idx_t ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
//direct sparsify: trasform accumulated dense vector to a CSR row
sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
@ -165,10 +165,10 @@ spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
///NUMERIC STEP
//perform Gustavson over rows blocks -> M / @cfg->gridRows
ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
idx_t rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong b,startRow,block; //omp for aux vars
idx_t b,startRow,block; //omp for aux vars
#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
for (b=0; b < cfg->gridRows; b++){
block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
@ -184,7 +184,7 @@ spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
} */
for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
for (idx_t ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
//direct sparsify: trasform accumulated dense vector to a CSR row
sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
@ -209,7 +209,7 @@ spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
}
spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%dxN=%d\n",A->M,B->N);
idx_t* bColOffsets = NULL; //B group columns starting offset for each row
ACC_DENSE *accVectors=NULL,*accV;
SPACC* accRowPart;
@ -218,10 +218,10 @@ spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
idx_t *rowsPartsSizes=NULL, *rowSizes=NULL; //for rows' cols partition, correct len
if (!AB) goto _err;
//2D indexing aux vars
ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
idx_t _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
idx_t _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
idx_t startRow,startCol,rowBlock,colBlock; //data division aux variables
////get bColOffsets for B column groups
if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
goto _err;

@ -114,7 +114,7 @@ static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
* sorting inplace the nodes inserted in the rbtree */
sortRbNode(nodes,abRowLen);
#elif _OUT_IDXS == T || _COL_PARTS == T
uint i=0;
int i=0;
idx_t k;
#if _COL_PARTS == T
//colParts aux vars
@ -144,7 +144,7 @@ static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)
idx_t k;
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
k = rb_entry(n,rbNode,rb)->key;
printf("%lu, ",k);
printf("%d, ",k);
}
printf("\n");
}*/
@ -327,7 +327,8 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
goto _err;
}
#endif //_COL_PARTS
uint maxThreads = omp_get_max_threads(); //TODO FROM CFG
int maxThreads;
maxThreads = omp_get_max_threads(); //TODO FROM CFG
//index keeping aux struct
//rbtree implementation or idxMap with aux of symbTree for tmp outIdx keeping
if ( rbTreeUsed ){
@ -340,7 +341,7 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
goto _err;
}
//init roots
for (uint i=0; i<maxThreads; i++)
for (int i=0; i<maxThreads; i++)
rbRoots[i] = RB_ROOT_CACHED;
}
if (symbRowImplID == IDXMAP){ //idxMap implementation && not outIdxs via rbtree
@ -350,7 +351,7 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
goto _err;
}
//init idxs maps
for (uint i=0; i<maxThreads; i++){
for (int i=0; i<maxThreads; i++){
if(initSpVectIdxDenseAcc(b->N,idxsMapAccs+i)) goto _err;
}
}
@ -409,7 +410,7 @@ idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
free(rbRoots);
free(rbNodes);
if (idxsMapAccs){
for (uint i=0; i<maxThreads; i++) free(idxsMapAccs[i].idxsMap);
for (int i=0; i<maxThreads; i++) free(idxsMapAccs[i].idxsMap);
free(idxsMapAccs);
}
@ -439,7 +440,7 @@ idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
#if _OUT_IDXS == TRUE
#ifndef OUT_IDXS_RBTREE_NODES
//return the mul.result nnz index inside the rbNodes
uint i=0;
int i=0;
//rbNodeOrderedVisit(n,root) outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
@ -495,7 +496,7 @@ idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
*outIdxs[i] = upperBoundedSymMat + cumul;
#endif //#if _OUT_IDXS == TRUE
//rbTrees for index keeping
uint maxThreads = omp_get_max_threads(); //TODO FROM CFG
int maxThreads = omp_get_max_threads(); //TODO FROM CFG
idx_t abMaxRowLen = reductionMaxSeq(abUpperBoundedRowsLens, a->M);
#ifdef HEURISTICS_UB
idx_t maxRowLenUB = abMaxRowLen * SP3MM_UB_HEURISTIC; //TODO UB HEURISTC
@ -515,7 +516,7 @@ idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
goto _err;
}
//init roots
for (uint i=0; i<maxThreads; i++) rbRoots[i] = RB_ROOT_CACHED;
for (int i=0; i<maxThreads; i++) rbRoots[i] = RB_ROOT_CACHED;
if (!( rbNodes = malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes)) )){
ERRPRINT("Sp3MM_Symb_ rbNodes malloc errdn\n");
goto _err;
@ -527,7 +528,7 @@ idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F)
goto _err;
}
//init idxs maps
for (uint i=0; i<maxThreads; i++){
for (int i=0; i<maxThreads; i++){
if(initSpVectIdxDenseAcc(c->N,idxsMapAccs+i)) goto _err;
}
}

@ -42,7 +42,7 @@ spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){ //serial impleme
if ( allocAccDense(&acc,B->N) ) goto _free;
if (!(AB = allocSpMatrix(A->M,B->N))) goto _free;
for( idx_t r=0; r<A->M; r++ ){
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, &acc);
sparsifyDirect(&acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
}
@ -54,7 +54,7 @@ spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){ //serial impleme
////////Sp3MM as 2 x SpMM
///1D
spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
DEBUG printf("spmm\trows of A,\tfull B\tM=%d x N=%d\n",A->M,B->N);
///thread aux
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
@ -83,7 +83,7 @@ spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(acc)
for (ulong r=0; r<A->M; r++){ //row-by-row formulation
for (idx_t r=0; r<A->M; r++){ //row-by-row formulation
//iterate over nz entry index c inside current row r
acc = accVects + omp_get_thread_num();
/* direct use of sparse scalar vector multiplication
@ -93,7 +93,7 @@ spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc);
}*/
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
//trasform accumulated dense vector to a CSR row
#if SPARSIFY_PRE_PARTITIONING == T
@ -124,7 +124,7 @@ spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
}
idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void** accumul, void **rows_sizes, void** tmp_matrix){
DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
DEBUG printf("spmm\trows of A,\tfull B\tM=%d x N=%d\n",A->M,B->N);
///thread aux
ACC_DENSE *accVects = NULL,*acc;
SPMM_ACC* outAccumul=NULL;
@ -153,7 +153,7 @@ idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(acc)
for (ulong r=0; r<A->M; r++){ //row-by-row formulation
for (idx_t r=0; r<A->M; r++){ //row-by-row formulation
//iterate over nz entry index c inside current row r
acc = accVects + omp_get_thread_num();
/* direct use of sparse scalar vector multiplication
@ -163,7 +163,7 @@ idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void
bRowLen = B->IRP[ca+1] - B->IRP[ca];
CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc);
}*/
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++) //row-by-row formul
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
//trasform accumulated dense vector to a CSR row
#if SPARSIFY_PRE_PARTITIONING == T
@ -197,7 +197,7 @@ idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void
if(outAccumul) freeSpMMAcc(outAccumul);
}
void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double** AS, idx_t** JA, idx_t** IRP){
void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double* AS, idx_t* JA, idx_t* IRP){
SPMM_ACC* outAccumul= *accumul;
idx_t* rowsSizes = *rows_sizes;
spmat *AB = *tmp_matrix;
@ -216,7 +216,7 @@ void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void**
}
spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N);
DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%d x N=%d\n",A->M,B->N);
DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
///thread aux
ACC_DENSE *accVects = NULL,*acc;
@ -243,10 +243,10 @@ spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
#endif
//perform Gustavson over rows blocks -> M / @cfg->gridRows
ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
idx_t rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong b,startRow,block; //omp for aux vars
idx_t b,startRow,block; //omp for aux vars
#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
for (b=0; b < cfg->gridRows; b++){
block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
@ -255,13 +255,13 @@ spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUGPRINT{
fflush(NULL);
printf("block %lu\t%lu:%lu(%lu)\n",b,startRow,startRow+block-1,block);
printf("block %d\t%d:%d(%d)\n",b,startRow,startRow+block-1,block);
fflush(NULL);
}
//row-by-row formulation in the given row block
for (ulong r=startRow; r<startRow+block; r++){
for (idx_t r=startRow; r<startRow+block; r++){
//iterate over nz entry index c inside current row r
for (ulong c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++)
for (idx_t c=A->IRP[r]-OFF_F; c<A->IRP[r+1]-OFF_F; c++)
CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc);
//trasform accumulated dense vector to a CSR row
#if SPARSIFY_PRE_PARTITIONING == T
@ -295,7 +295,7 @@ spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
///2D
//PARTITIONS NOT ALLOCATED
spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%dxN=%d\n",A->M,B->N);
DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
idx_t* bColOffsets = NULL; //B group columns starting offset for each row
ACC_DENSE *accVectors=NULL,*accV;
@ -305,10 +305,10 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
idx_t* rowsPartsSizes=NULL;
if (!AB) goto _err;
//2D indexing aux vars
ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
idx_t _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
idx_t _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
idx_t startRow,startCol,rowBlock,colBlock; //data division aux variables
////get bColOffsets for B column groups
if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
goto _err;
@ -344,8 +344,8 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong tileID,t_i,t_j; //for aux vars
ulong bPartLen,bPartID,bPartOffset;//B partition acces aux vars
idx_t tileID,t_i,t_j; //for aux vars
idx_t bPartLen,bPartID,bPartOffset;//B partition acces aux vars
#pragma omp parallel for schedule(runtime) \
private(accV,accRowPart,rowBlock,colBlock,startRow,startCol,\
bPartLen,bPartID,bPartOffset,t_i,t_j)
@ -364,15 +364,15 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUGPRINT{
fflush(NULL);
colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem);
printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock);
printf("colBlock [%lu\t%lu:%lu(%lu)]\n",t_j,startCol,startCol+colBlock-1,colBlock);
printf("rowBlock [%d\t%d:%d(%d)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock);
printf("colBlock [%d\t%d:%d(%d)]\n",t_j,startCol,startCol+colBlock-1,colBlock);
fflush(NULL);
}
///AB[t_i][t_j] block compute
for (ulong r=startRow; r<startRow+rowBlock; r++){
for (idx_t r=startRow; r<startRow+rowBlock; r++){
//iterate over nz col index j inside current row r
//row-by-row restricted to colsubset of B to get AB[r][:colBlock:]
for (ulong j=A->IRP[r]-OFF_F,c; j<A->IRP[r+1]-OFF_F; j++){
for (idx_t j=A->IRP[r]-OFF_F,c; j<A->IRP[r+1]-OFF_F; j++){
//get start of B[A->JA[j]][:colBlock:]
c = A->JA[j]-OFF_F; // col of nnz in A[r][:] <-> target B row
bPartID = IDX2D(c,t_j,cfg->gridCols);
@ -415,7 +415,7 @@ spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
}
spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%luxN=%lu\n",A->M,B->N);
DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%dxN=%d\n",A->M,B->N);
DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols);
spmat *AB = NULL, *colPartsB = NULL, *colPart;
idx_t* rowsPartsSizes=NULL;
@ -423,14 +423,14 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
SPMM_ACC* outAccumul=NULL;
ACC_DENSE *accVectors=NULL,*accV;
SPACC* accRowPart;
ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
idx_t startRow,startCol,rowBlock,colBlock; //data division aux variables
//2D indexing aux vars
idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
idx_t* bColOffsets = NULL;
if (!(AB = allocSpMatrix(A->M,B->N))) goto _err;
ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
idx_t _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
idx_t _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
////B cols partition in CSRs
//if (!(colPartsB = CAT(colsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) goto _err;
@ -465,7 +465,7 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
ulong tileID,t_i,t_j; //for aux vars
idx_t tileID,t_i,t_j; //for aux vars
#pragma omp parallel for schedule(runtime) \
private(accV,accRowPart,colPart,rowBlock,colBlock,startRow,startCol,t_i,t_j)
for (tileID = 0; tileID < gridSize; tileID++){
@ -484,17 +484,17 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
DEBUGPRINT{
fflush(NULL);
printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",
printf("rowBlock [%d\t%d:%d(%d)]\t",
t_i,startRow,startRow+rowBlock-1,rowBlock);
printf("colBlock [%lu\t%lu:%lu(%lu)]\n",
printf("colBlock [%d\t%d:%d(%d)]\n",
t_j,startCol,startCol+colBlock-1,colBlock);
fflush(NULL);
}
///AB[t_i][t_j] block compute
for (ulong r=startRow; r<startRow+rowBlock; r++){
for (idx_t r=startRow; r<startRow+rowBlock; r++){
//iterate over nz col index j inside current row r
//row-by-row restricted to colsubset of B to get AB[r][:colBlock:]
for (ulong j=A->IRP[r]-OFF_F,c,bRowStart,bRowLen; j<A->IRP[r+1]-OFF_F; j++){
for (idx_t j=A->IRP[r]-OFF_F,c,bRowStart,bRowLen; j<A->IRP[r+1]-OFF_F; j++){
//get start of B[A->JA[j]][:colBlock:]
c = A->JA[j]-OFF_F; // column of nnz entry in A[r][:] <-> target B row
bRowStart = colPart->IRP[c];
@ -532,7 +532,7 @@ spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg)
AB = NULL;
_free:
if (colPartsB){
for (ulong i=0; i<cfg->gridCols; i++)
for (idx_t i=0; i<cfg->gridCols; i++)
freeSpmatInternal(colPartsB+i);
free(colPartsB);
}
@ -558,7 +558,7 @@ spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,
/* TODO
alloc dense aux vector, reusable over 3 product
TODO arrays sovrallocati per poter essere riusati nelle 2 SpMM
ulong auxVectSize = MAX(R->N,AC->N);
idx_t auxVectSize = MAX(R->N,AC->N);
auxVectSize = MAX(auxVectSize,P->N);
*/
@ -590,7 +590,7 @@ spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,
spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,
SPMM_INTERF spmm){
ulong* rowSizes = NULL;
idx_t* rowSizes = NULL;
SPMM_ACC* outAccumul=NULL;
ACC_DENSE *accVectorsR_AC=NULL,*accVectorsRAC_P=NULL,*accRAC,*accRACP;
@ -619,11 +619,11 @@ spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,
goto _err;
}
ulong c;
idx_t c;
((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (R->M,R,cfg);
AUDIT_INTERNAL_TIMES Start=omp_get_wtime();
#pragma omp parallel for schedule(runtime) private(accRAC,accRACP,c)
for (ulong r=0; r<R->M; r++){ //row-by-row formulation
for (idx_t r=0; r<R->M; r++){ //row-by-row formulation
//iterate over nz entry index c inside current row r
accRAC = accVectorsR_AC + omp_get_thread_num();
accRACP = accVectorsRAC_P + omp_get_thread_num();

@ -46,10 +46,10 @@
* alloc threads' aux arrays once and split them in threads' structures
* so free them once from the first thread struct, with the original pointers returned from the alloc
*/
ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEXT
ACC_DENSE* _initAccVectors_monoalloc(idx_t num,idx_t size){ //TODO PERF WITH NEXT
ACC_DENSE* out = NULL;
double* vAll = NULL;
ulong* vAllNzIdx = NULL;
idx_t* vAllNzIdx = NULL;
if (!(out = calloc(num,sizeof(*out)))){
ERRPRINT("_initAccVectors aux struct alloc failed\n");
return NULL;
@ -62,7 +62,7 @@ ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEX
ERRPRINT("_initAccVectors aux dense vectors' idx alloc failed\n");
goto err;
}
for (ulong i=0; i<num; i++){
for (idx_t i=0; i<num; i++){
out[i].vLen = size; //TODO USELESS INFO?
out[i].v = vAll + i*size;
out[i].nnzIdx = vAllNzIdx + i*size;;
@ -76,7 +76,7 @@ ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEX
free(vAllNzIdx);
return NULL;
}
int allocAccDense(ACC_DENSE* v,ulong size){
int allocAccDense(ACC_DENSE* v,idx_t size){
v->vLen = size;
if (!(v->v = calloc(size,sizeof(*(v->v))))) {
ERRPRINT("_initAccVectors aux dense vector alloc failed\n");
@ -91,19 +91,19 @@ int allocAccDense(ACC_DENSE* v,ulong size){
return EXIT_SUCCESS;
}
ACC_DENSE* _initAccVectors(ulong num,ulong size){
ACC_DENSE* _initAccVectors(idx_t num,idx_t size){
ACC_DENSE* out = NULL;
if (!(out = calloc(num,sizeof(*out)))){
ERRPRINT("_initAccVectors aux struct alloc failed\n");
return NULL;
}
for (ulong i=0; i<num; i++){
for (idx_t i=0; i<num; i++){
if (allocAccDense(out+i,size)) goto _err;
}
return out;
_err:
for (ulong i=0; i<num; i++){
for (idx_t i=0; i<num; i++){
if (out[i].v) free(out[i].v);
if (out[i].nnzIdx) free(out[i].nnzIdx);
}
@ -111,17 +111,17 @@ ACC_DENSE* _initAccVectors(ulong num,ulong size){
return NULL;
}
void freeAccsDense(ACC_DENSE* vectors,ulong num){
for (ulong i=0; i<num; i++){
void freeAccsDense(ACC_DENSE* vectors,idx_t num){
for (idx_t i=0; i<num; i++){
free(vectors[i].v);
free(vectors[i].nnzIdx);
free(vectors[i].nnzIdxMap.idxsMap);
}
free(vectors);
}
void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num){
void _freeAccsDenseChecks(ACC_DENSE* vectors,idx_t num){
if (!vectors) return;
for (ulong i=0; i<num; i++){
for (idx_t i=0; i<num; i++){
if(vectors[i].v) free(vectors[i].v);
if(vectors[i].nnzIdx) free(vectors[i].nnzIdx);
}
@ -143,8 +143,8 @@ int initSpVectIdxDenseAcc(idx_t idxMax,SPVECT_IDX_DENSE_MAP* vectIdxsMap){
return EXIT_SUCCESS;
}
void checkOverallocPercent(ulong* forecastedSizes,spmat* AB){
for (ulong r=0,rSize,forecastedSize; r < AB->M; r++){
void checkOverallocPercent(idx_t* forecastedSizes,spmat* AB){
for (idx_t r=0,rSize,forecastedSize; r < AB->M; r++){
forecastedSize = forecastedSizes[r];
#ifdef ROWLENS
rSize = AB->RL[r];
@ -158,11 +158,11 @@ void checkOverallocPercent(ulong* forecastedSizes,spmat* AB){
}
}
DEBUGPRINT
printf("extra forecastedSize of row: %lu\t=\t%lf %% \n",
printf("extra forecastedSize of row: %d\t=\t%lf %% \n",
r,100*(forecastedSize-rSize) / (double) forecastedSize);
}
idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
printf("extra forecastedSize of the matrix: \t%d\t = %lf %% \n",
extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
}
int spmatDiff(spmat* A, spmat* B){
@ -187,8 +187,8 @@ int spmatDiff(spmat* A, spmat* B){
double* CSRToDense(spmat* sparseMat){
double* denseMat;
ulong i,j,idxNZ,denseSize;
if (__builtin_umull_overflow(sparseMat->M,sparseMat->N,&denseSize)){
idx_t i,j,idxNZ,denseSize;
if (__builtin_smul_overflow(sparseMat->M,sparseMat->N,&denseSize)){
ERRPRINT("overflow in dense allocation\n");
return NULL;
}
@ -200,7 +200,7 @@ double* CSRToDense(spmat* sparseMat){
for (idxNZ=sparseMat->IRP[i]; idxNZ<sparseMat->IRP[i+1]; ++idxNZ){
j = sparseMat->JA[idxNZ];
//converting sparse item into dense entry
denseMat[(ulong) IDX2D(i,j,sparseMat->N)] = sparseMat->AS[idxNZ];
denseMat[(idx_t) IDX2D(i,j,sparseMat->N)] = sparseMat->AS[idxNZ];
}
}
return denseMat;
@ -214,17 +214,17 @@ void printSparseMatrix(spmat* spMatrix,char justNZMarkers){
static inline int _colsPartitioningUnifRanges_init(spmat* A,uint gridCols,
static inline int _colsPartitioningUnifRanges_init(spmat* A,int gridCols,
spmat** colParts,idx_t** colPartsLens){
spmat* colPart;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *tmpJA;
idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *tmpJA;
///alloc/init partitions structures
if (!(*colParts = calloc(gridCols, sizeof(**colParts)))){
ERRPRINT("colsPartitioningUnifRanges\tcolumns partitions of A calloc fail\n");
return EXIT_FAILURE;
}
for (ulong i=0,colBlock; i<gridCols; i++){
for (idx_t i=0,colBlock; i<gridCols; i++){
colBlock = UNIF_REMINDER_DISTRI(i,_colBlock,_colBlockRem);
colPart = *colParts + i;
if (allocSpMatrixInternal(A->M,colBlock,colPart)){
@ -249,13 +249,13 @@ static inline int _colsPartitioningUnifRanges_init(spmat* A,uint gridCols,
return EXIT_SUCCESS;
}
static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,uint gridCols,
static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,int gridCols,
spmat* colParts,idx_t* colPartsLens){
spmat* colPart;
double* tmpAS; idx_t* tmpJA;
//realloc overallcd A parts NZ arrays (TODO -> downsizing -> nofails?)
for (ulong i=0,partLen; i<gridCols; i++){
for (idx_t i=0,partLen; i<gridCols; i++){
colPart = colParts + i;
partLen = colPartsLens[i];
if (!(tmpAS = realloc(colPart->AS,partLen*sizeof(*(colPart->AS))))){
@ -281,22 +281,22 @@ static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,uint gridCol
#endif
//////////////////////// CSR SPECIFIC -- TODO RENAME //////////////////
///SPARSE MATRIX PARTITIONING
idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
ulong subRowsN = A->M * gridCols;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
ulong* offsets = malloc( (subRowsN+1) * sizeof(*offsets) );
idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols){
idx_t subRowsN = A->M * gridCols;
idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
idx_t* offsets = malloc( (subRowsN+1) * sizeof(*offsets) );
if (!offsets) {
ERRPRINT("colsOffsetsPartitioningUnifRanges:\toffsets malloc errd\n");
return NULL;
}
///OFFSETS COMPUTE FOR COL GROUPS -> O( A.NZ )
for (ulong r=0, j=0; r<A->M; j=A->IRP[++r]-OFF_F){
for (idx_t r=0, j=0; r<A->M; j=A->IRP[++r]-OFF_F){
offsets[ IDX2D(r,0,gridCols) ] = j; //row's first gc start is costrained
//navigate column groups inside current row
for (ulong gc=1,gcStartCol; gc<gridCols; gc++){
for (idx_t gc=1,gcStartCol; gc<gridCols; gc++){
gcStartCol = UNIF_REMINDER_DISTRI_STARTIDX(gc,_colBlock,_colBlockRem);
//goto GroupCols start entry,keeping A's nnz entries navigation (idx j)
//for (ulong c=A->JA[j]-OFF_F; c<gcStartCol && j < A->IRP[r+1]-OFF_F; c=A->JA[++j]-OFF_F);
//for (idx_t c=A->JA[j]-OFF_F; c<gcStartCol && j < A->IRP[r+1]-OFF_F; c=A->JA[++j]-OFF_F);
while (j < A->IRP[r+1]-OFF_F && A->JA[j]-OFF_F < gcStartCol)
j++;
offsets[ IDX2D(r,gc,gridCols) ] = j; //row's gc group startIdx
@ -306,12 +306,12 @@ idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
return offsets;
}
spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,
spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,int gridCols,
idx_t** colPartsOffsets)
{
spmat *colParts = NULL, *colPart;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
ulong *colPartsLens=NULL, *tmpJA;
idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols;
idx_t *colPartsLens=NULL, *tmpJA;
double* tmpAS;
///alloc/init partitions structures
@ -349,15 +349,15 @@ spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,
return colParts;
_err:
free(*colPartsOffsets);
for (ulong i=0; i<gridCols; i++) freeSpmatInternal(colParts+i);
for (idx_t i=0; i<gridCols; i++) freeSpmatInternal(colParts+i);
free(colOffsets);
free(colParts);
free(colPartsLens);
return NULL;
}
spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols){
spmat *colParts, *colPart;
ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *colPartsLens=NULL, *tmpJA;
idx_t _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *colPartsLens=NULL, *tmpJA;
double* tmpAS;
///alloc/init partitions structures
if (_colsPartitioningUnifRanges_init(A,gridCols,&colParts,&colPartsLens))
@ -366,9 +366,9 @@ spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
* Parallelize: 2for collapse OMP, gcEndCol -> startIdxize, ...
* oppure wrappare cio in static inline
*/
for (ulong r=0, j=0; r<A->M; j=A->IRP[++r]-OFF_F){
for (idx_t r=0, j=0; r<A->M; j=A->IRP[++r]-OFF_F){
//navigate column groups inside current row
for (ulong gc=0,gcEndCol=0,i; gc<gridCols ; gc++,j+=i){
for (idx_t gc=0,gcEndCol=0,i; gc<gridCols ; gc++,j+=i){
i = 0; //@i=len current subpartition of row @r to copy
colPart = colParts + gc;
//NB: not shifting IRP because is handled as internal implementation component
@ -376,7 +376,7 @@ spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
colPart->IRP[r] = colPartsLens[gc];
gcEndCol += UNIF_REMINDER_DISTRI(gc,_colBlock,_colBlockRem);
//goto next GroupCols,keeping A's nnz entries navigation ( index j+i )
//for (ulong c=A->JA[j+i]-OFF_F; c<gcEndCol && j+i < A->IRP[r+1]-OFF_F; c=A->JA[j+ ++i]-OFF_F);
//for (idx_t c=A->JA[j+i]-OFF_F; c<gcEndCol && j+i < A->IRP[r+1]-OFF_F; c=A->JA[j+ ++i]-OFF_F);
while ( j+i < A->IRP[r+1]-OFF_F && A->JA[j+i]-OFF_F < gcEndCol ) i++;
memcpy(colPart->AS+colPart->IRP[r], A->AS+j, i*sizeof(*A->AS));
memcpy(colPart->JA+colPart->IRP[r], A->JA+j, i*sizeof(*A->JA));
@ -393,14 +393,14 @@ spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){
free(colPartsLens);
return colParts;
_err:
for (ulong i=0; i<gridCols; i++) freeSpmatInternal(colParts+i);
for (idx_t i=0; i<gridCols; i++) freeSpmatInternal(colParts+i);
if(colParts) free(colParts);
if(colPartsLens) free(colPartsLens);
return NULL;
}
void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
void CAT(checkOverallocRowPartsPercent_,OFF_F)(idx_t* forecastedSizes,spmat* AB,
idx_t gridCols,idx_t* bColOffsets){
idx_t* abColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(AB, gridCols);
assert(abColOffsets); //partitioning error
@ -410,7 +410,7 @@ void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
DEBUGCHECKS assert(forecast >= rLen);
}
idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
printf("extra forecastedSize of the matrix: \t%d\t = %lf %% \n",
extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
free(abColOffsets);
@ -422,25 +422,25 @@ void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
///inline export here
//SPMV_CHUNKS_DISTR spmvChunksFair;
spmat* allocSpMatrix(ulong rows, ulong cols);
int allocSpMatrixInternal(ulong rows, ulong cols, spmat* mat);
spmat* allocSpMatrix(idx_t rows, idx_t cols);
int allocSpMatrixInternal(idx_t rows, idx_t cols, spmat* mat);
void freeSpmatInternal(spmat* mat);
void freeSpmat(spmat* mat);
////INTERNAL TEST FUNCTIONS
//test that each row's partition from colsOffsetsPartitioningUnifRanges is in the correct index range
#include <alloca.h>
int testColsOffsetsPartitioningUnifRanges(spmat* mat,ulong gridCols,ulong* partsOffs){
ulong _colBlock = mat->N/gridCols, _colBlockRem = mat->N%gridCols;
ulong j=0; //CSR scanning nnz idx
ulong* colPartsPopulations = alloca(gridCols * sizeof(*colPartsPopulations));
int testColsOffsetsPartitioningUnifRanges(spmat* mat,idx_t gridCols,idx_t* partsOffs){
idx_t _colBlock = mat->N/gridCols, _colBlockRem = mat->N%gridCols;
idx_t j=0; //CSR scanning nnz idx
idx_t* colPartsPopulations = alloca(gridCols * sizeof(*colPartsPopulations));
memset(colPartsPopulations,0,gridCols * sizeof(*colPartsPopulations));
for (ulong r=0,pId=0; r<mat->M; r++){
for (ulong gc=0,pStartIdx,pEndIdx; gc<gridCols; gc++,pId++){
for (idx_t r=0,pId=0; r<mat->M; r++){
for (idx_t gc=0,pStartIdx,pEndIdx; gc<gridCols; gc++,pId++){
pStartIdx = UNIF_REMINDER_DISTRI_STARTIDX(gc,_colBlock,_colBlockRem);
pEndIdx = UNIF_REMINDER_DISTRI_STARTIDX(gc+1,_colBlock,_colBlockRem)-1;
//pId=IDX2D(r,gc,gridCols);
for (ulong idx=partsOffs[pId],c; idx<partsOffs[pId+1]; idx++,j++){
for (idx_t idx=partsOffs[pId],c; idx<partsOffs[pId+1]; idx++,j++){
c = mat->JA[idx];
assert(j == idx); //consecutive index in partitioning
assert(pStartIdx <= c && c <= pEndIdx); //colRange
@ -450,12 +450,12 @@ int testColsOffsetsPartitioningUnifRanges(spmat* mat,ulong gridCols,ulong* parts
}
}
assert(j == mat->NZ);
ulong s=0;
for (ulong gc=0,partSize; gc < gridCols; gc++,s+=partSize){
idx_t s=0;
for (idx_t gc=0,partSize; gc < gridCols; gc++,s+=partSize){
partSize = colPartsPopulations[gc];
double partShare=partSize/(double)mat->NZ,partsAvg=1/(double)gridCols;
double partShareAvgDiff = partShare - partsAvg;
printf("colPartition %lu has:\t%lu = %lf of NNZ\t\t .. %lf\tAVG diff\n",
printf("colPartition %d has:\t%d = %lf of NNZ\t\t .. %lf\tAVG diff\n",
gc,partSize,partShare,partShareAvgDiff);
}
assert(s == mat->NZ); //TODO DUPLICATED
@ -482,14 +482,14 @@ int main(int argc, char** argv){
return out;
}
////partitioning test
ulong* colsPartitions = colsOffsetsPartitioningUnifRanges_0(mat,Conf.gridCols);
idx_t* colsPartitions = colsOffsetsPartitioningUnifRanges_0(mat,Conf.gridCols);
if (!colsPartitions) goto _free;
if (testColsOffsetsPartitioningUnifRanges(mat,Conf.gridCols,colsPartitions))
goto _free;
out=EXIT_SUCCESS;
printf("testColsOffsetsPartitioningUnifRanges passed with "
"mat: %lux%lu-%luNNZ\tgrid: %dx%d\n",
"mat: %dx%d-%dNNZ\tgrid: %dx%d\n",
mat->M,mat->N,mat->NZ,Conf.gridRows,Conf.gridCols);
_free:
if (colsPartitions) free(colsPartitions);

@ -145,12 +145,12 @@ int fread_wrap(FILE* fp,void* dst,size_t count){
return rd;
}
///STRUCTURED DATA IO
int writeDoubleVector(char* fpath,double* v,ulong size){
int writeDoubleVector(char* fpath,double* v,idx_t size){
int fd,out=EXIT_FAILURE;
if ( (fd = createNewFile(fpath) ) < 0 ) goto _end;
write_wrap(fd,v,size * sizeof(*v)); //raw write of vector
out = EXIT_SUCCESS;
DEBUG printf("written double vector into %s RAW of %lu elements\n",fpath,size);
DEBUG printf("written double vector into %s RAW of %d elements\n",fpath,size);
_end:
if (close(fd) == EOF) perror("close errd\n");
@ -158,21 +158,21 @@ int writeDoubleVector(char* fpath,double* v,ulong size){
}
///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
int writeDoubleVectorAsStr(char* fpath,double* v,ulong size){
int writeDoubleVectorAsStr(char* fpath,double* v,idx_t size){
int out=EXIT_FAILURE;
FILE* fp = fopen(fpath,"w");
if (!fp){
perror("fopen vector file write");
return EXIT_FAILURE;
}
for (ulong i=0; i<size; i++){
for (idx_t i=0; i<size; i++){
if (fprintf(fp,DOUBLE_STR_FORMAT,v[i]) < 0){
ERRPRINT("fprintf to out vector file errd\n");
goto _end;
}
}
out = EXIT_SUCCESS;
DEBUG printf("written vector into %s of %lu elements\n",fpath,size);
DEBUG printf("written vector into %s of %d elements\n",fpath,size);
_end:
if (fclose(fp) == EOF) perror("fclose errd\n");
@ -180,10 +180,10 @@ int writeDoubleVectorAsStr(char* fpath,double* v,ulong size){
}
double* readDoubleVector(char* fpath,ulong* size){
double* readDoubleVector(char* fpath,idx_t* size){
int rd,hleft;
double *out,*tmp;
ulong i=0,vectorSize=RNDVECTORSIZE;
idx_t i=0,vectorSize=RNDVECTORSIZE;
if (*size) vectorSize = *size;
FILE* fp = fopen(fpath,"r");
if (!fp){
@ -198,13 +198,13 @@ double* readDoubleVector(char* fpath,ulong* size){
if (i >= vectorSize ){ //realloc the array
vectorSize *= VECTOR_STEP_REALLOC;
if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){
ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20);
ERRPRINTS("realloc errd to ~~ %d MB\n",vectorSize >> 20);
goto _err;
}
out = tmp;
DEBUG printf("reallocd to ~~ %lu MB\n",vectorSize >> 20);
DEBUG printf("reallocd to ~~ %d MB\n",vectorSize >> 20);
}
ulong toRead = MIN(VECTOR_READ_BLOCK,(vectorSize-i));
idx_t toRead = MIN(VECTOR_READ_BLOCK,(vectorSize-i));
if((rd = fread_wrap(fp,out + i,sizeof(*out)*toRead)) == -2){
ERRPRINT("fread_wrap errd\n");
goto _err;
@ -227,7 +227,7 @@ double* readDoubleVector(char* fpath,ulong* size){
goto _err;
}
out = tmp;
DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size);
DEBUG printf("readed vector from %s of %d elements\n",fpath,*size);
goto _free;
_err:
@ -238,10 +238,10 @@ double* readDoubleVector(char* fpath,ulong* size){
return out;
}
double* readDoubleVectorStr(char* fpath,ulong* size){
double* readDoubleVectorStr(char* fpath,idx_t* size){
int fscanfOut;
double *out,*tmp;
ulong i=0,vectorSize=RNDVECTORSIZE;
idx_t i=0,vectorSize=RNDVECTORSIZE;
if (*size) vectorSize = *size;
FILE* fp = fopen(fpath,"r");
if (!fp){
@ -256,11 +256,11 @@ double* readDoubleVectorStr(char* fpath,ulong* size){
if (i >= vectorSize ){ //realloc the array
vectorSize *= VECTOR_STEP_REALLOC;
if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){
ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20);
ERRPRINTS("realloc errd to ~~ %d MB\n",vectorSize >> 20);
goto _err;
}
out = tmp;
DEBUG printf("reallocd to ~~ %lu MB\n",vectorSize >> 20);
DEBUG printf("reallocd to ~~ %d MB\n",vectorSize >> 20);
}
fscanfOut = fscanf(fp,DOUBLE_STR_FORMAT,out + i++ );
if ( fscanfOut == EOF && ferror(fp)){
@ -277,7 +277,7 @@ double* readDoubleVectorStr(char* fpath,ulong* size){
goto _err;
}
out = tmp;
DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size);
DEBUG printf("readed vector from %s of %d elements\n",fpath,*size);
goto _free;
_err:
@ -292,10 +292,10 @@ double* readDoubleVectorStr(char* fpath,ulong* size){
int getConfig(CONFIG* conf){
int changes=EXIT_FAILURE;
char *varVal,*ptr;
ulong val;
idx_t val;
if ((varVal = getenv(GRID_ROWS))){
val=strtoul(varVal,&ptr,10);
if (ptr==varVal || val>= UINT_MAX){
if (ptr==varVal || val>= INT_MAX){
perror("strtol errd");
} else {
conf->gridRows = val;
@ -304,7 +304,7 @@ int getConfig(CONFIG* conf){
}
if ((varVal = getenv(GRID_COLS))){
val=strtoul(varVal,&ptr,10);
if (ptr==varVal || val>= UINT_MAX){
if (ptr==varVal || val>= INT_MAX){
perror("strtol errd");
} else {
conf->gridCols = val;
@ -317,11 +317,11 @@ int getConfig(CONFIG* conf){
/////LIB-SORTING -- WRAPPERS
//comparing functions
static inline int cmp_idx_t(const void* a, const void*b){
idx_t aa=*((ulong*) a), bb = *((ulong*) b);
idx_t aa=*((idx_t*) a), bb = *((idx_t*) b);
return aa==bb?0:aa>bb?1:-1;
}
static inline int cmpulong(const void* a, const void*b){
ulong aa=*((ulong*) a), bb = *((ulong*) b);
static inline int cmpidx_t(const void* a, const void*b){
idx_t aa=*((idx_t*) a), bb = *((idx_t*) b);
return aa==bb?0:aa>bb?1:-1;
}
static inline int cmpuint(const void* a, const void*b){
@ -336,8 +336,8 @@ static inline int cmpRbNode(const void* a, const void* b){
void sort_idx_t(idx_t* arr, idx_t len){
qsort(arr,len,sizeof(*arr),cmp_idx_t);
}
void sortulong(ulong* arr, ulong len){
qsort(arr,len,sizeof(*arr),cmpulong);
void sortidx_t(idx_t* arr, idx_t len){
qsort(arr,len,sizeof(*arr),cmpidx_t);
}
void sortuint(uint* arr, uint len){
qsort(arr,len,sizeof(*arr),cmpuint);
@ -368,7 +368,7 @@ static inline int rndDouble_sinDecimal(double* d){
ERRPRINT("read_wrap failed to read holder for rnd double\n");
return EXIT_FAILURE;
}
*d = (_rndHold % (ulong) ceil(MAXRND)) + sin(_rndHold);
*d = (_rndHold % (idx_t) ceil(MAXRND)) + sin(_rndHold);
return EXIT_SUCCESS;
}
@ -383,8 +383,8 @@ void statsAvgVar(double* values,uint numVals, double* out){
}
/// MATRIX - VECTOR COMPUTE UTILS
int fillRndVector(ulong size, double* v){
for( ulong x=0; x<size; ++x ){
int fillRndVector(idx_t size, double* v){
for( idx_t x=0; x<size; ++x ){
if(rndDouble_sinAll( v+x )) return EXIT_FAILURE;
#ifdef RNDVECTMIN
v[x] += RNDVECTMIN;
@ -394,18 +394,18 @@ int fillRndVector(ulong size, double* v){
}
//convention true result in @a, toCheck in @b
int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
int doubleVectorsDiff(double* a, double* b, idx_t n,double* diffMax){
int out = EXIT_SUCCESS;
double diff,diffAbs,_diffMax=0;
if (diffMax) *diffMax = 0;
else diffMax = &_diffMax;
for (ulong i=0; i<n; i++){
for (idx_t i=0; i<n; i++){
diff = a[i] - b[i];
diffAbs = ABS( diff );
if( diffAbs > DOUBLE_DIFF_THREASH ){
out = EXIT_FAILURE;
ERRPRINTS("DOUBLE VECTORS DIFF: DOUBLE_DIFF_THREASH=%lf\t<\t"
"|%13lg| = %lf %% of @a[%lu]\n",
"|%13lg| = %lf %% of @a[%d]\n",
DOUBLE_DIFF_THREASH,diff,100*diffAbs/ABS(a[i]),i);
#ifdef DOUBLE_VECT_DIFF_EARLY_EXIT
/*#pragma message("DOUBLE_VECT_DIFF_EARLY_EXIT: only first diff double reported")*/
@ -415,7 +415,7 @@ int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
if ( ABS(*diffMax) < diffAbs ) *diffMax = diff;
}
DEBUG{
printf("\nchecked diff %s"CEND" between 2 double vector of %lu elements"
printf("\nchecked diff %s"CEND" between 2 double vector of %d elements"
"\tmax diff: %le %s threash: %le\n", !out?CCC"OK":CCCERR"ERR!",
n,*diffMax,*diffMax<DOUBLE_DIFF_THREASH?"<":">=",
DOUBLE_DIFF_THREASH);
@ -427,9 +427,9 @@ int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax){
return out;
}
void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers){
printf("printing matrix: %lu x %lu\n",m,n);
ulong i,j;
void printMatrix(double* mat,idx_t m,idx_t n,char justNZMarkers){
printf("printing matrix: %d x %d\n",m,n);
idx_t i,j;
for (i=0;i<m;i++){
for (j=0;j<n;j++){
if (justNZMarkers) printf("%s",mat[IDX2D(i,j,n)]?".":" ");
@ -441,8 +441,8 @@ void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers){
}
void printVector(double* v,ulong size){
for( ulong i=0;i<size;i++) printf("%1.1lf ",v[i]);
void printVector(double* v,idx_t size){
for( idx_t i=0;i<size;i++) printf("%1.1lf ",v[i]);
printf("\n");
}
@ -464,7 +464,7 @@ static inline char* searchPatternInStrs(char* pattern,char** strings){
return out;
}
inline int appendArr(ulong val,APPENDARRAY* list){
inline int appendArr(idx_t val,APPENDARRAY* list){
return 0; //TODO
}

@ -9,13 +9,23 @@ enum impl_types
};
CHUNKS_DISTR chunksFair;
CHUNKS_DISTR chunksFairFolded;
CHUNKS_DISTR chunksNOOP;
CHUNKS_DISTR_INTERF chunkDistrbFunc=&chunksFairFolded;
static CONFIG Conf = {
.gridRows = 20,
.gridCols = 2,
.symbMMRowImplID = RBTREE,
};
void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
double **a_as_ptr, idx_t **a_ja_ptr,
idx_t **a_irp_ptr, idx_t a_max_row_nz,
idx_t **a_irp_ptr,
idx_t b_m, idx_t b_n, idx_t b_nz,
double **b_as_ptr, idx_t **b_ja_ptr,
idx_t **b_irp_ptr, idx_t b_max_row_nz,
idx_t **b_irp_ptr,
enum impl_types impl_choice,
void **accumul,
void **rows_sizes,
@ -24,7 +34,6 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
{
int rc;
spmat a, b;
CONFIG cfg;
double *a_as = *a_as_ptr;
idx_t *a_ja = *a_ja_ptr;
@ -39,15 +48,32 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
b->RL = b_rl;
#endif // ROWLENS
// setting up cfg
// TODO : CHECK THAT THIS IS COMPATIBLE WITH PSB
rc = getConfig(&cfg);
if (!getConfig(&Conf)){
VERBOSE printf("configuration changed from env");
}
//save exported parallelization grid configured ... see Config.md in the top folder
//const ushort GRIDROWS = Conf.gridRows;
//const ushort GRIDCOLS = Conf.gridCols;
const ushort ORIG_GRID_CONF[2] = {Conf.gridRows,Conf.gridCols};
int maxThreads = omp_get_max_threads();
Conf.threadNum = (uint) maxThreads;
DEBUG printf("omp_get_max_threads:\t%d\n",maxThreads);
// TODO : change chunk distribution with a choice ?
cfg.chunkDistrbFunc = &chunksFair;
cfg.threadNum = 1;
/*
* get exported schedule configuration,
* if schedule != static -> dyn like -> set a chunk division function before omp for
*/
int schedKind_chunk_monotonic[3];
ompGetRuntimeSchedule(schedKind_chunk_monotonic);
Conf.chunkDistrbFunc = &chunksNOOP;
if (schedKind_chunk_monotonic[0] != omp_sched_static)
Conf.chunkDistrbFunc = chunkDistrbFunc;
VERBOSE
printf("%s",Conf.chunkDistrbFunc == &chunksNOOP?"static schedule =>chunkDistrbFunc NOOP\n":"");
int i, row_nz;
printf("irp %d %d %d %d %d\n", a_irp[0], a_irp[1], a_irp[2], a_irp[3], a_irp[4]);
// setting up spmat type matrices
a.M = a_m;
a.N = a_n;
@ -55,7 +81,13 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
a.AS = a_as;
a.JA = a_ja;
a.IRP = a_irp;
a.MAX_ROW_NZ = a_max_row_nz;
a.MAX_ROW_NZ = a.IRP[1] - a.IRP[0];
for (i = 1; i < a.M; i ++){
row_nz = a.IRP[i+1] - a.IRP[i];
if (row_nz > a.MAX_ROW_NZ){
a.MAX_ROW_NZ = row_nz;
}
}
b.M = b_m;
b.N = b_n;
@ -63,13 +95,20 @@ void psb_f_spmm_build_spacc(idx_t a_m, idx_t a_n, idx_t a_nz,
b.AS = b_as;
b.JA = b_ja;
b.IRP = b_irp;
b.MAX_ROW_NZ = b_max_row_nz;
b.MAX_ROW_NZ = b.IRP[1] - b.IRP[0];
for (i = 1; i < b.M; i ++){
row_nz = b.IRP[i+1] - b.IRP[i];
if (row_nz > b.MAX_ROW_NZ){
b.MAX_ROW_NZ = row_nz;
}
}
// computing the size
switch (impl_choice)
{
case ROW_BY_ROW_UB:
*nnz = spmmRowByRowCalculateSize_1(&a, &b, &cfg, accumul, rows_sizes, tmp_matrix);
*nnz = spmmRowByRowCalculateSize_1(&a, &b, &Conf, accumul, rows_sizes, tmp_matrix);
default:
break;
}
@ -79,11 +118,14 @@ void psb_f_spmm_merge_spacc(void **accumul,
void **rows_sizes,
void **tmp_matrix,
enum impl_types impl_choice,
double **c_as,
idx_t **c_ja,
idx_t **c_irp,
double **c_as_ptr,
idx_t **c_ja_ptr,
idx_t **c_irp_ptr,
int *info)
{
double *c_as = *c_as_ptr;
idx_t *c_ja = *c_ja_ptr;
idx_t *c_irp = *c_irp_ptr;
// merging the rows into the correct arrays
switch (impl_choice)
{

@ -39,10 +39,10 @@
//hold SPMM result over a unpartitionated space among threads-row[s' blocks]
typedef struct{
//space to hold SPMM output
ulong* JA;
idx_t* JA;
double* AS;
ulong size; //num of entries allocated -> only dbg checks
ulong lastAssigned; //last JA&AS assigned index to an accumulator(atom)
idx_t size; //num of entries allocated -> only dbg checks
idx_t lastAssigned; //last JA&AS assigned index to an accumulator(atom)
SPACC* accs; //SPARSIFIED ACC POINTERS
uint accsNum;
} SPMM_ACC; //accumulator for SPMM

@ -61,7 +61,7 @@ SPMM CAT(spmmRowByRow_,OFF_F);
idx_t CAT(spmmRowByRowCalculateSize_,OFF_F) (spmat* A, spmat*B, CONFIG* cfg, void** accumul, void **rows_sizes, void** tmp_matrix);
void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double** AS, idx_t** JA, idx_t** IRP);
void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double* AS, idx_t* JA, idx_t* IRP);
/*
* sparse parallel implementation of @A * @B parallelizing Gustavson

@ -37,12 +37,12 @@
* both of accumulator's dense array and nnzIdx in @aux and has to be big @vectLen
*/
inline void CAT(scSparseVectMul_,OFF_F)
(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux){
for (ulong i=0,j; i<vectLen; i++){
(double scalar,double* vectVals,idx_t* vectIdxs,idx_t vectLen, ACC_DENSE* aux){
for (idx_t i=0,j; i<vectLen; i++){
j = vectIdxs[i]-OFF_F;
DEBUGCHECKS{
if (j>=aux->vLen){
fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
fprintf(stderr,"index %d outside vLen %d\n",j,aux->vLen);
assert(j < aux->vLen);
}
}
@ -65,12 +65,12 @@ inline void CAT(scSparseVectMul_,OFF_F)
* both of accumulator's dense array and nnzIdx in @aux and has to be big @vectLen
*/
inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux){
for (ulong i=0,j; i<vectLen; i++){
idx_t* vectIdxs,idx_t vectLen,idx_t startIdx,ACC_DENSE* aux){
for (idx_t i=0,j; i<vectLen; i++){
j = vectIdxs[i]-OFF_F - startIdx;
DEBUGCHECKS{
if (j>=aux->vLen){
fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
fprintf(stderr,"index %d outside vLen %d\n",j,aux->vLen);
assert(j < aux->vLen);
}
}
@ -90,9 +90,9 @@ inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
#endif
///TODO IMPLEMENT SCALAR<->ROW MUL AS GENERIC SPARSE VECTOR<->SCALAR MUL
//inline void scSparseRowMul(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
ulong rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
//inline void scSparseRowMul(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux){
inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux){
idx_t rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
#ifdef ROWLENS
rowLen = mat->RL[trgtR];
#else
@ -116,7 +116,6 @@ inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
ERRPRINT("spMMSizeUpperbound: rowSizes calloc errd\n");
return NULL;
}
printf("A->IRP %ld %ld %ld %ld %ld\n", A->IRP[0], A->IRP[1], A->IRP[2], A->IRP[3], A->IRP[4]);
idx_t fullMatBound = 0;
#pragma omp parallel for schedule(static) reduction(+:fullMatBound)
for (idx_t r=0; r<A->M; r++){
@ -135,7 +134,7 @@ inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
rowSizes[A->M] = fullMatBound;
AUDIT_INTERNAL_TIMES End= omp_get_wtime();
VERBOSE
printf("spMMSizeUpperbound:%lu\t%le s\n",rowSizes[A->M],End-Start);
printf("spMMSizeUpperbound:%d\t%le s\n",rowSizes[A->M],End-Start);
return rowSizes;
}
@ -170,7 +169,7 @@ inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
rowPartsSizes[ A->M*gridCols ] = fullMatBound;
AUDIT_INTERNAL_TIMES End= omp_get_wtime();
VERBOSE
printf("spMMSizeUpperboundColParts_:%lu\t%le s\n",rowPartsSizes[A->M],End-Start);
printf("spMMSizeUpperboundColParts_:%d\t%le s\n",rowPartsSizes[A->M],End-Start);
return rowPartsSizes;
}
@ -182,7 +181,7 @@ inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
#include "utils.h"
///Allocs - Free
//SpMM holder of accumulats
inline SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum){
inline SPMM_ACC* initSpMMAcc(idx_t entriesNum, idx_t accumulatorsNum){
SPMM_ACC* out = calloc(1,sizeof(*out));
if (!out){
ERRPRINT("initSpMMAcc: out calloc errd\n");
@ -275,7 +274,7 @@ static inline void _sparsifyUB(ACC_DENSE* accV,SPACC* accSparse,idx_t startColAc
//row[Part] sparsified in a thread safe (exactly long) reserved area using atomics
static inline void sparsifyUBNoPartsBounds
(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc){
(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, idx_t startColAcc){
//sort nnz indexes of dense accumulator
idx_t nnz = accV->nnzIdxMap.len;
idx_t sparsifyStartV; //start index(inside @accSparse) of @accV to sparsify
@ -305,11 +304,11 @@ static inline void sparsifyUBNoPartsBounds
*/
inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
CONFIG* conf){
ulong nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
idx_t nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
//TODO PARALLEL MERGE - SAVE STARTING offset OF EACH PARTITION IN THE OUT MATRIX
ulong* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
idx_t* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
///count nnz entries and alloc arrays for them
for (ulong r=0; r<mat->M; r++){
for (idx_t r=0; r<mat->M; r++){
//for each partition ->get len -> outMat.IRP and aux offsets
for (j=0,rLen=0; j<conf->gridCols; j++){
idx = IDX2D(r,j,conf->gridCols);
@ -332,20 +331,20 @@ inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
return EXIT_FAILURE;
}
///popolate with rows nnz values and indexes
ulong pLen; //omp for aux vars
idx_t pLen; //omp for aux vars
#pragma omp parallel for schedule(static) private(pLen)
for (ulong i=0; i<partsNum; i++){
for (idx_t i=0; i<partsNum; i++){
pLen = rowsParts[i].len;
memcpy(mat->AS + rowsPartsOffsets[i],rowsParts[i].AS,pLen*sizeof(*(mat->AS)));
memcpy(mat->JA + rowsPartsOffsets[i],rowsParts[i].JA,pLen*sizeof(*(mat->JA)));
}
CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
for (ulong i=0,w=0; i<mat->M; i++){
for (idx_t i=0,w=0; i<mat->M; i++){
if (mat->IRP[i] != w)
{ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
for (j=0; j<conf->gridCols; j++){
SPACC r = rowsParts[IDX2D(i,j,conf->gridCols)];
for (ulong jj=0; jj<r.len; jj++,w++){
for (idx_t jj=0; jj<r.len; jj++,w++){
if (mat->AS[w]!= r.AS[jj]){
ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
if (mat->JA[w]!= r.JA[jj]){
@ -363,9 +362,9 @@ inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
* allocd arrays to hold non zero values and indexes into @mat
*/
inline int mergeRows(SPACC* rows,spmat* mat){
ulong nzNum=0;
idx_t nzNum=0;
//count nnz entries and alloc arrays for them
for (ulong r=0; r<mat->M; ++r){
for (idx_t r=0; r<mat->M; ++r){
nzNum += rows[r].len;
mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
@ -384,15 +383,15 @@ inline int mergeRows(SPACC* rows,spmat* mat){
///POPOLATE WITH ROWS NNZ VALUES AND INDEXES
//TODO PARALLEL COPY
#pragma omp parallel for schedule(static)
for (ulong r=0; r<mat->M; r++){
for (idx_t r=0; r<mat->M; r++){
memcpy(mat->AS+mat->IRP[r], rows[r].AS, rows[r].len*sizeof(*(mat->AS)));
memcpy(mat->JA+mat->IRP[r], rows[r].JA, rows[r].len*sizeof(*(mat->JA)));
}
CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
for (ulong r=0,i=0; r<mat->M; r++){
for (idx_t r=0,i=0; r<mat->M; r++){
if (i != mat->IRP[r])
{ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
for (ulong j=0; j<rows[r].len; j++,i++){
for (idx_t j=0; j<rows[r].len; j++,i++){
if (mat->AS[i]!= rows[r].AS[j]){
ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
if (mat->JA[i] != rows[r].JA[j]){
@ -415,9 +414,9 @@ inline int mergeRows(SPACC* rows,spmat* mat){
*/
inline idx_t calculateSize(SPACC* rows, spmat *mat){
ulong nzNum=0;
idx_t nzNum=0;
//count nnz entries and alloc arrays for them
for (ulong r=0; r<mat->M; ++r){
for (idx_t r=0; r<mat->M; ++r){
nzNum += rows[r].len;
mat->IRP[r+1] = nzNum;
#ifdef ROWLENS
@ -429,16 +428,17 @@ inline idx_t calculateSize(SPACC* rows, spmat *mat){
return nzNum;
}
inline int mergeRowsPopulate(SPACC* rows, spmat* mat, double** AS, idx_t** JA, idx_t** IRP) {
inline int mergeRowsPopulate(SPACC* rows, spmat* mat, double* AS, idx_t* JA, idx_t* IRP) {
// Populate with rows' non-zero values and indexes
// TODO PARALLEL COPY
#pragma omp parallel for schedule(static)
for (ulong r = 0; r < mat->M; r++) {
memcpy(*AS + mat->IRP[r], rows[r].AS, rows[r].len * sizeof(double));
memcpy(*JA + mat->IRP[r], rows[r].JA, rows[r].len * sizeof(idx_t));
// #pragma omp parallel for schedule(static)
for (idx_t r = 0; r < mat->M; r++) {
fprintf(stderr, "r %d, AS %p, mat->IRP[r] %d, rows[r].AS %p, rows[r].len %d\n", r,AS,mat->IRP[r],rows[r].AS, rows[r].len);
memcpy(AS + mat->IRP[r], rows[r].AS, rows[r].len * sizeof(double));
memcpy(JA + mat->IRP[r], rows[r].JA, rows[r].len * sizeof(idx_t));
}
memcpy(*IRP, mat->IRP, (mat->M + 1) * sizeof(idx_t));
memcpy(IRP, mat->IRP, (mat->M + 1) * sizeof(idx_t));
// Consistency checks
// TODO: Add your consistency checks here

@ -40,11 +40,11 @@ typedef enum{
//
typedef struct{
ushort gridRows;
ushort gridCols;
int gridRows;
int gridCols;
//TODO FULL CONFIG DOCCED HERE
ROW_MMSYM_IMPL_MODE symbMMRowImplID; //how to compute symb mul (if required)
uint threadNum; //thread num to use in an OMP parallel region ...
int threadNum; //thread num to use in an OMP parallel region ...
void* chunkDistrbFunc; //CHUNKS_DISTR_INTERF func pntr
} CONFIG;
///Smart controls

@ -33,17 +33,17 @@ void cleanRbNodes(rbRoot* root,rbNode* nodes,idx_t nodesNum);
int rbInsertNewKey(rbRoot *root,rbNode *node, idx_t key);
//void C_FortranShiftIdxs(spmat* outMat);
//void Fortran_C_ShiftIdxs(spmat* m);
ACC_DENSE* _initAccVectors(ulong num,ulong size);
ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size); //TODO PERF WITH NEXT
SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum);
ACC_DENSE* _initAccVectors(idx_t num,idx_t size);
ACC_DENSE* _initAccVectors_monoalloc(idx_t num,idx_t size); //TODO PERF WITH NEXT
SPMM_ACC* initSpMMAcc(idx_t entriesNum, idx_t accumulatorsNum);
idx_t reductionMaxSeq(idx_t* arr,idx_t arrLen);
int _allocAccDense(ACC_DENSE* v,ulong size);
int _allocAccDense(ACC_DENSE* v,idx_t size);
int mergeRows(SPACC* rows,spmat* mat);
int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,CONFIG* conf);
void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num);
void _freeAccsDenseChecks(ACC_DENSE* vectors,idx_t num);
void _resetAccVect(ACC_DENSE* acc);
void _resetIdxMap(SPVECT_IDX_DENSE_MAP* acc);
void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
void freeAccsDense(ACC_DENSE* vectors,ulong num);
void freeAccsDense(ACC_DENSE* vectors,idx_t num);
void freeSpMMAcc(SPMM_ACC* acc);
void sparsifyDenseVect(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc);
void sparsifyDenseVect(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, idx_t startColAcc);

@ -35,9 +35,9 @@
////inline exports
//multi implmentation functions
void CAT(scSparseVectMul_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux);
void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux);
void CAT(_scRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
void CAT(scSparseVectMul_,OFF_F)(double scalar,double* vectVals,idx_t* vectIdxs,idx_t vectLen, ACC_DENSE* aux);
void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,idx_t* vectIdxs,idx_t vectLen,idx_t startIdx,ACC_DENSE* aux);
void CAT(_scRowMul_,OFF_F)(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux);
void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,idx_t trgtR, ACC_DENSE* aux);
idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B);
idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)(spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets);

@ -42,7 +42,7 @@
#define IN_RANGE(i,start,end) ( (start) <= (i) && (i) <= (end) )
//ceil(x/y) with integers
#define INT_DIV_CEIL(x,y) ( ( (x) - 1) / (y) + 1 )
#define INT_DIV_CEIL(x,y) ( ( (x) - 1) / (y) + 1 )
//2D ROW MAJOR indexing wrap compute
#define IDX2D(i,j,nCols) ((j) + (i)*(nCols))

@ -40,14 +40,14 @@
#include "config.h"
#include "omp.h"
//distribution of @rows|blocks of @matrix, exploiting @config
typedef void (CHUNKS_DISTR ) (ulong,spmat*,CONFIG*);
typedef void (*CHUNKS_DISTR_INTERF ) (ulong,spmat*,CONFIG*);
typedef void (CHUNKS_DISTR ) (idx_t,spmat*,CONFIG*);
typedef void (*CHUNKS_DISTR_INTERF ) (idx_t,spmat*,CONFIG*);
//NOOP chunks division for manual configuration via export OMP_SCHEDULE
inline void chunksNOOP(ulong r,spmat* mat,CONFIG* cfg){ return; }
inline void chunksNOOP(idx_t r,spmat* mat,CONFIG* cfg){ return; }
#include "ompGetICV.h"
//fair division of @r elements from matrix @mat with threads num in @cfg
inline void chunksFair(ulong r,spmat* mat,CONFIG* cfg){
inline void chunksFair(idx_t r,spmat* mat,CONFIG* cfg){
assert(cfg->threadNum > 0); //configured target thread num
omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
omp_get_schedule(&kind,&chunk_size);
@ -75,7 +75,7 @@ inline void chunksFair(ulong r,spmat* mat,CONFIG* cfg){
}
//fair division of @r elements from matrix @mat of threads in @cfg
//subdividing the fair share with a factor of @FAIR_CHUNKS_FOLDING
inline void chunksFairFolded(ulong r,spmat* mat,CONFIG* cfg){
inline void chunksFairFolded(idx_t r,spmat* mat,CONFIG* cfg){
assert(cfg->threadNum > 0); //configured target thread num
omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
omp_get_schedule(&kind,&chunk_size);

@ -35,16 +35,16 @@
#include "sparseMatrix.h"
typedef struct{
ulong row;
ulong col;
idx_t row;
idx_t col;
double val;
} entry; //MatrixMarket COO entry
typedef struct{
MM_typecode mcode;
entry* entries;
ulong* rowLens;
ulong M,N,NZ; //spmat sizes
idx_t* rowLens;
idx_t M,N,NZ; //spmat sizes
} MatrixMarket;
////COO PARSE
@ -61,7 +61,7 @@ int MMCheck(MM_typecode typecode);
* return allocated and filled COO entries with the NNZ number into
* NO SORT CHECKING HERE
*/
entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens);
entry* MMtoCOO(idx_t* NZ, FILE *fp, MM_typecode mcode,idx_t* rowLens);
////COO -> ANYTHING ELSE CONVERSION
/*
@ -69,13 +69,13 @@ entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens);
* EXPECTED: CSR arrays allocated, @entries col sorted in (not madatory consecut) rows
* [simmetrical parts explicitly rappresented --> not important here]
*/
int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens);
int COOtoCSR(entry* entries, spmat* mat,idx_t* rowLens);
/*
* write COO entries in @entries inside sparse matrix @mat in ELL format
* EXPECTED: @entries col sorted in (not madatory consecut) rows
* ELL internal array allocated in this function, not freed in case of error
*/
int COOtoELL(entry* entries, spmat* mat, ulong* rowLens);
int COOtoELL(entry* entries, spmat* mat, idx_t* rowLens);
////wrapper MM -> specialized target
/*
* Parse MatrixMarket matrix stored in file at @matPath

@ -75,7 +75,7 @@ typedef struct{
idx_t vLen; //size of the aux dense vector //TODO USELESS?
SPVECT_IDX_DENSE_MAP nnzIdxMap; //
} ACC_DENSE;
int allocAccDense(ACC_DENSE* v, ulong size);
int allocAccDense(ACC_DENSE* v, idx_t size);
/*
**** NNZ IDXS PRESENCE FLAGS ACCESS INTERFACE: ***

@ -40,19 +40,19 @@
* returning an offsets matrix out[i][j] = start of jth colPartition of row i
* subdivide @A columns in uniform cols ranges in the output
*/
idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols);
/*
* partition CSR sparse matrix @A in @gridCols columns partitions as
* indipended and allocated sparse matrixes and return them
* subdivide @A columns in uniform cols ranges in the output
*/
spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,int gridCols);
//same as above but with (aux) use of offsets partitoning (also returned if colOffsets!=NULL
spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,idx_t** colPartsOffsets);
spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,int gridCols,idx_t** colPartsOffsets);
//same as checkOverallocPercent but with 2D partitioning - CSR col partitioning
void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
void CAT(checkOverallocRowPartsPercent_,OFF_F)(idx_t* forecastedSizes,spmat* AB,
idx_t gridCols,idx_t* bColOffsets);
///////////////////////////////////////////////////////////////////////////////
@ -62,13 +62,13 @@ void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
//shift every index about the sparse data for use the matrix in a fortran app
inline void C_FortranShiftIdxs(spmat* m){
for(ulong r=0; r<m->M+1; m -> IRP[r]++,r++);
for(ulong i=0; i<m->NZ; m -> JA[i]++, i++);
for(idx_t r=0; r<m->M+1; m -> IRP[r]++,r++);
for(idx_t i=0; i<m->NZ; m -> JA[i]++, i++);
}
//shift every index about the sparse data for use the matric in a C app
inline void Fortran_C_ShiftIdxs(spmat* m){ //TODO DBG ONLY and compleatness
for(ulong r=0; r<m->M+1; m -> IRP[r]--,r++);
for(ulong i=0; i<m->NZ; m -> JA[i]--, i++);
for(idx_t r=0; r<m->M+1; m -> IRP[r]--,r++);
for(idx_t i=0; i<m->NZ; m -> JA[i]--, i++);
}
/*
@ -77,7 +77,7 @@ inline void Fortran_C_ShiftIdxs(spmat* m){ //TODO DBG ONLY and compleatness
* in @forecastedSizes there's for each row -> forecasted size
* and in the last entry the cumulative of the whole matrix
*/
void checkOverallocPercent(ulong* forecastedSizes,spmat* AB);
void checkOverallocPercent(idx_t* forecastedSizes,spmat* AB);
/*
check if sparse matrixes A<->B differ up to
DOUBLE_DIFF_THREASH per element
@ -86,7 +86,7 @@ int spmatDiff(spmat* A, spmat* B);
////dyn alloc of spMM output matrix
/*
///size prediction of AB = @A * @B
inline ulong SpMMPreAlloc(spmat* A,spmat* B){
inline idx_t SpMMPreAlloc(spmat* A,spmat* B){
//TODO BETTER PREALLOC HEURISTICS HERE
return MAX(A->NZ,B->NZ);
}
@ -110,7 +110,7 @@ inline spmat* initSpMatrixSpMM(spmat* A, spmat* B){
#define REALLOC_FACTOR 1.5
//realloc sparse matrix NZ arrays
inline int reallocSpMatrix(spmat* mat,ulong newSize){
inline int reallocSpMatrix(spmat* mat,idx_t newSize){
mat->NZ *= newSize;
void* tmp;
if (!(tmp = realloc(mat->AS,mat->NZ * sizeof(*(mat->AS))))){
@ -133,6 +133,6 @@ void printSparseMatrix(spmat* sparseMat,char justNZMarkers);
/*convert @sparseMat sparse matrix in dense matrix returned*/
double* CSRToDense(spmat* sparseMat);
void freeAccsDense(ACC_DENSE* vectors,ulong num);
void freeAccsDense(ACC_DENSE* vectors,idx_t num);
#endif //SPARSEUTILS_H_COMMON_IDX_IMPLS

@ -58,18 +58,18 @@ int createNewFile(char* const outFpath);
#define DOUBLE_STR_FORMAT "%25le\n"
//write double vector @v as row sequence of double at @fpath
//e.g. read with od -tf8 -w8 fpath : OCTALOFFSET: DOUBLE FULL DIGITS
int writeDoubleVector(char* fpath,double* v,ulong size);
int writeDoubleVector(char* fpath,double* v,idx_t size);
/*
* read vector of double [Str] of arbitrary size from @fpath, true lenght in *size
* if size point to a nnz value, the initial allocation will be of *size
* eventual successive reallocation done multipling *size with VECTOR_STEP_REALLOC
*/
double* readDoubleVector(char* fpath,ulong* size);
double* readDoubleVectorStr(char* fpath,ulong* size);
double* readDoubleVector(char* fpath,idx_t* size);
double* readDoubleVectorStr(char* fpath,idx_t* size);
///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
//dual of readDoubleVectorVector
int writeDoubleVectorAsStr(char* fpath,double* v,ulong size);
int writeDoubleVectorAsStr(char* fpath,double* v,idx_t size);
int MPI_Dims_create(int nnodes, int ndims, int dims[]); //commons/ompi_dims_create/ompi_dims_create.c
#include "config.h"
@ -81,16 +81,16 @@ int getConfig(CONFIG* conf);
//append only list implemented with a reallocated array
typedef struct{
ulong* a;
ulong size;
ulong lastIdx;
idx_t* a;
idx_t size;
idx_t lastIdx;
} APPENDARRAY;
//append @val to @list, reallocating if reached end
//TODO inline int appendArr(ulong val,APPENDARRAY* list);
//TODO inline int appendArr(idx_t val,APPENDARRAY* list);
void sortuint(uint* arr, uint len); //sort uint array @arr of @len elements
void sort_idx_t(idx_t* arr, idx_t len);
void sortulong(ulong* arr, ulong len);
void sortidx_t(idx_t* arr, idx_t len);
void sortRbNode(rbNode* arr,idx_t len);
///ranges functions
@ -157,9 +157,9 @@ inline idx_t reductionMaxOmp(idx_t* arr,idx_t arrLen){
* of the 2 entries of the input vectors, signed as a[i] - b[i] (as well as dump prints)
* CONVENTION: @a = true result, @b vector to check with the true result
*/
int doubleVectorsDiff(double* a, double* b, ulong n,double* diffMax);
int doubleVectorsDiff(double* a, double* b, idx_t n,double* diffMax);
//fill a random vector in @v long @size doubles
int fillRndVector(ulong size, double* v);
int fillRndVector(idx_t size, double* v);
//read vector as a sequence of space separated double from file at @fpath
#define VECTOR_STEP_MALLOC 100
@ -174,8 +174,8 @@ int fillRndVector(ulong size, double* v);
int extractInTmpFS(char* path, char* tmpFsDecompressPath);
//compute E[@values] in @out[0] and VAR[@values] in @out[1] of @numVals values
void statsAvgVar(double* values,uint numVals, double* out);
void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers);
void printVector(double* v,ulong size);
void printMatrix(double* mat,idx_t m,idx_t n,char justNZMarkers);
void printVector(double* v,idx_t size);
void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
#endif

@ -52,10 +52,10 @@ int MMCheck(MM_typecode mcode) {
return EXIT_SUCCESS;
}
entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens){
entry* MMtoCOO(idx_t* NZ, FILE *fp, MM_typecode mcode,idx_t* rowLens){
int scanndRet=0;
ulong nzTrgt=*NZ,nzIdx=0; //expanded num of nz (in case of sym matrix)
ulong diagEntries=0, row = 0, col = 0;//current entry's row,col from MM -> 1 based
idx_t nzTrgt=*NZ,nzIdx=0; //expanded num of nz (in case of sym matrix)
idx_t diagEntries=0, row = 0, col = 0;//current entry's row,col from MM -> 1 based
double val = 0;
entry* entries = NULL; //COO parsed entries
///init
@ -70,10 +70,10 @@ entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens){
///parse MM fp lines into COOordinate entries
while (1) { // Reading the fp until EOF
if (mm_is_pattern(mcode)){
scanndRet = fscanf(fp, "%lu %lu\n", &row, &col);
scanndRet = fscanf(fp, "%d %d\n", &row, &col);
val = 1.0;
} else if (mm_is_real(mcode) || (mm_is_integer(mcode))){
scanndRet = fscanf(fp, "%lu %lu %lf\n", &row, &col, &val);
scanndRet = fscanf(fp, "%d %d %lf\n", &row, &col, &val);
}
@ -168,11 +168,11 @@ MatrixMarket* MMRead(char* matPath){
////COO -> ANYTHING ELSE CONVERSION
int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
int COOtoCSR(entry* entries, spmat* mat,idx_t* rowLens){
int out = EXIT_FAILURE;
ulong idx;
idx_t idx;
long* _rowsLastCol = NULL; //for each row -> last added entry's columnIdx
ulong* rowsNextIdx = NULL; //for each row -> next entry progressive idx
idx_t* rowsNextIdx = NULL; //for each row -> next entry progressive idx
if (!(rowsNextIdx = calloc(mat->M,sizeof(*rowsNextIdx)))){
ERRPRINT("MMtoCOO: rowsNextIdx calloc errd\n");
goto _end;
@ -186,9 +186,9 @@ int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
}
/*TODO OLD
* //get rowLens->IRP (partial), TODO moved MMtoCOO to avoid FULL rescan entries
* for (ulong i=0; i<mat->NZ; i++) mat->IRP[entries[i].row+1]++;
* for (idx_t i=0; i<mat->NZ; i++) mat->IRP[entries[i].row+1]++;
* memcpy(mat->RL,mat->IRP + 1,sizeof(*mat->IRP) * mat->M); //TODO in next ifdef
* for (ulong i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
* for (idx_t i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
* OLD2: rowLens memcpy ... no just moved the pointer
* #ifdef ROWLENS
* memcpy(mat->RL,rowLens,sizeof(*rowLens) * mat->M); //TODO in next ifdef
@ -197,19 +197,19 @@ int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
//IRP: trasform rows lens as increments to build row index "pointer"
//0th -> 0 mandatory; 1th = 0th row len, ...., M+1th = end of Mth row
memcpy(mat->IRP+1,rowLens,sizeof(*rowLens) * mat->M);//init IRP with rows lens
for (ulong i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
for (idx_t i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
CONSISTENCY_CHECKS assert(mat->IRP[mat->M] == mat->NZ);
///FILL
//TODO EXPECTED entries with .col entries -> CONSISTENCY_CHECKS
//sorted for each row (even nn sequential in @entries)
//entries write in CSR format
entry* e;
for (ulong i=0; i<mat->NZ; i++) {
for (idx_t i=0; i<mat->NZ; i++) {
e = entries+i;
CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE SORTED
/*#pragma message("COO sorting check enabled")*/
if (_rowsLastCol[e->row] >= (long) e->col){
ERRPRINTS("not sorted entry:%ld,%ld,%lf",e->row,e->col,e->val);
ERRPRINTS("not sorted entry:%d,%d,%lf",e->row,e->col,e->val);
goto _end;
}
_rowsLastCol[e->row] = e->col;
@ -228,16 +228,16 @@ int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
return out;
}
int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
int COOtoELL(entry* entries, spmat* mat, idx_t* rowLens){
int out=EXIT_FAILURE;
ulong maxRow = 0, col, _ellEntriesTot, *rowsNextCol;
idx_t maxRow = 0, col, _ellEntriesTot, *rowsNextCol;
long* _rowsLastCol=NULL;
entry* e;
for (ulong i=0; i<mat->M; i++) maxRow = MAX(maxRow,rowLens[i]);
for (idx_t i=0; i<mat->M; i++) maxRow = MAX(maxRow,rowLens[i]);
_ellEntriesTot = 2*mat->M*maxRow;
#ifdef LIMIT_ELL_SIZE
if ( _ellEntriesTot > ELL_MAX_ENTRIES ){
ERRPRINTS("Required entries %lu -> %lu uMB for the matrix exceed the "
ERRPRINTS("Required entries %d -> %lu uMB for the matrix exceed the "
"designated threashold of: %lu -> %lu MB for ellpack\n",
_ellEntriesTot,(sizeof(double)*_ellEntriesTot) >> 20,
ELL_MAX_ENTRIES,(sizeof(double)*ELL_MAX_ENTRIES) >> 20);
@ -274,12 +274,12 @@ int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
///FILL NZ
//TODO EXPECTED entries with .col entries -> CONSISTENCY_CHECKS
//sorted for each row (even nn sequential in @entries)
for (ulong i=0; i<mat->NZ; i++){
for (idx_t i=0; i<mat->NZ; i++){
e = entries + i;
CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE COLS SORTED for righe successive
/*#pragma message("COO sorting check enabled")*/
if (_rowsLastCol[e->row] >= (long) e->col){
ERRPRINTS("not sorted entry:%ld,%ld,%lf",
ERRPRINTS("not sorted entry:%d,%d,%lf",
e->row,e->col,e->val);
goto _end;
}
@ -291,16 +291,16 @@ int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
}
///FILL PAD
ulong padded = 0,paddedEntries = mat->M*mat->MAX_ROW_NZ;
for (ulong r=0; r<mat->M; r++){
for (ulong c=rowLens[r],j=IDX2D(r,c,maxRow); c<maxRow; c++,j++,padded++){
idx_t padded = 0,paddedEntries = mat->M*mat->MAX_ROW_NZ;
for (idx_t r=0; r<mat->M; r++){
for (idx_t c=rowLens[r],j=IDX2D(r,c,maxRow); c<maxRow; c++,j++,padded++){
//mat->AS[j] = ELL_AS_FILLER; //TODO ALREADY DONE IN CALLOC
//mat->JA[j] = mat->JA[rowLens[r]-1]; //ELL_JA_FILLER; //TODO calloc CUDA benefit?
}
}
VERBOSE{
printf("padded %lu entries = %lf%% of NZ\n",padded,100*padded/(double) mat->NZ);
printf("ELL matrix of: %lu paddedEntries -> %lu MB of JA+AS\n",
printf("padded %d entries = %lf%% of NZ\n",padded,100*padded/(double) mat->NZ);
printf("ELL matrix of: %d paddedEntries -> %ld MB of JA+AS\n",
paddedEntries,(paddedEntries*sizeof(*(mat->JA))+paddedEntries*sizeof(*(mat->AS))) >> 20);
}
out = EXIT_SUCCESS;
@ -318,7 +318,7 @@ spmat* MMtoCSR(char* matPath){
return NULL;
}
if (!(mat = calloc(1,sizeof(*mat)))){
ERRPRINT("MMtoCSR: mat struct alloc errd");
ERRPRINT("MMtoCSR: mat struct alloc err\n");
goto err;
}
mat -> M = mm->M;
@ -345,7 +345,7 @@ spmat* MMtoCSR(char* matPath){
#endif
VERBOSE
printf("MMtoCSR: %lu NZ entries-> %lu MB of AS+JA+IRP\n",mat->NZ,
printf("MMtoCSR: %d NZ entries-> %ld MB of AS+JA+IRP\n",mat->NZ,
(mat->NZ*sizeof(*(mat->AS))+mat->NZ*sizeof(*(mat->JA))+(1+mat->M*sizeof(*(mat->IRP))))>>20);
goto _free;

@ -15,24 +15,20 @@ subroutine dspmm(a,b,c,info, impl_choice)
integer(psb_ipk_), intent(in) :: impl_choice
! Internal variables
integer(c_size_t) :: a_m,a_n,a_nz
type(c_ptr) :: a_as,a_ja,a_irp
integer(c_size_t) :: a_max_row_nz
integer(c_size_t) :: b_m,b_n,b_nz
type(c_ptr) :: b_as,b_ja,b_irp
integer(c_size_t) :: b_max_row_nz
integer(c_int) :: impl_choice_
type(c_ptr) :: accumul, rows_sizes, tmp_matrix
integer(c_size_t) :: nnz
type(c_ptr) :: c_as,c_ja,c_irp
integer(c_int) :: a_m,a_n,a_nz
type(c_ptr) :: a_as,a_ja,a_irp
integer(c_int) :: b_m,b_n,b_nz
type(c_ptr) :: b_as,b_ja,b_irp
integer(c_int) :: impl_choice_
type(c_ptr) :: accumul, rows_sizes, tmp_matrix
integer(c_int) :: nnz
type(c_ptr) :: c_as,c_ja,c_irp
interface spmm_build_spacc
subroutine psb_f_spmm_build_spacc(c_a_m,c_a_n,c_a_nz,&
c_a_as,c_a_ja,c_a_irp,&
c_a_max_row_nz,&
c_b_m,c_b_n,c_b_nz,&
c_b_as,c_b_ja,c_b_irp,&
c_b_max_row_nz,&
c_impl_choice,&
c_accumul,&
c_rows_sizes,&
@ -41,16 +37,14 @@ subroutine dspmm(a,b,c,info, impl_choice)
c_nnz) bind(C)
use iso_c_binding
use psb_base_mod
integer(c_size_t), intent(in), value :: c_a_m,c_a_n,c_a_nz
integer(c_int), intent(in), value :: c_a_m,c_a_n,c_a_nz
type(c_ptr), intent(in) :: c_a_as,c_a_ja,c_a_irp
integer(c_size_t), intent(in), value :: c_a_max_row_nz
integer(c_size_t), intent(in), value :: c_b_m,c_b_n,c_b_nz
integer(c_int), intent(in), value :: c_b_m,c_b_n,c_b_nz
type(c_ptr), intent(in) :: c_b_as,c_b_ja,c_b_irp
integer(c_size_t), intent(in), value :: c_b_max_row_nz
integer(c_int), intent(in), value :: c_impl_choice
type(c_ptr), intent(out) :: c_accumul,c_rows_sizes,c_tmp_matrix
integer(psb_ipk_), intent(out) :: c_info
integer(c_size_t), intent(out) :: c_nnz
integer(c_int), intent(out) :: c_nnz
end subroutine psb_f_spmm_build_spacc
end interface spmm_build_spacc
@ -74,7 +68,6 @@ subroutine dspmm(a,b,c,info, impl_choice)
a_m = a%get_nrows()
a_n = a%get_ncols()
a_nz = a%get_nzeros()
write(*,*) 'IRP(1:5) ',a%irp(1:5)
a_as = c_loc(a%val)
a_ja = c_loc(a%ja)
a_irp = c_loc(a%irp)
@ -89,24 +82,25 @@ subroutine dspmm(a,b,c,info, impl_choice)
! call calculateSize
call psb_f_spmm_build_spacc(a_m,a_n,a_nz,a_as,&
a_ja,a_irp,a_max_row_nz,&
a_ja,a_irp,&
b_m,b_n,b_nz,b_as,b_ja,&
b_irp,b_max_row_nz,&
b_irp,&
impl_choice_,accumul,&
rows_sizes,tmp_matrix,&
info,nnz)
! allocate c%val, c%ja and c%irp
allocate(c%val(nnz))
allocate(c%ja(nnz))
allocate(c%irp(a_m + 1))
! c_as = c_loc(c%val)
! c_ja = c_loc(c%ja)
! c_irp = c_loc(c%irp)
call psb_realloc(nnz,c%val,info)
if (info /= 0) write(*,*) "reallocation failed"
call psb_realloc(nnz,c%ja,info)
call psb_realloc(a_m+1,c%irp,info)
! c%set_nrows(a_m)
! c%set_ncols(b_n)
c_as = c_loc(c%val)
c_ja = c_loc(c%ja)
c_irp = c_loc(c%irp)
call c%set_nrows(a_m)
call c%set_ncols(b_n)
! call spmmRowByRowPopulate
call psb_f_spmm_merge_spacc(accumul,&

Loading…
Cancel
Save