/* * Sp3MM_for_AlgebraicMultiGrid * (C) Copyright 2021-2022 * Andrea Di Iorio * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/ #ifndef OFF_F #error generic implementation requires OFF_F defined #endif //////////////////// COMPUTE CORE Sp[3]MM Upperbound ////////////////////////// spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){ //serial implementation spmat* AB = NULL; ACC_DENSE acc; if ( allocAccDense(&acc,B->N) ) goto _free; if (!(AB = allocSpMatrix(A->M,B->N))) goto _free; for( idx_t r=0; rM; r++ ){ for (ulong c=A->IRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) //row-by-row formul CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, &acc); sparsifyDirect(&acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API } _free: freeAccsDense(&acc,1); return AB; } ////////Sp3MM as 2 x SpMM ///1D spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N); ///thread aux ACC_DENSE *accVects = NULL,*acc; SPMM_ACC* outAccumul=NULL; idx_t* rowsSizes = NULL; ///init AB matrix with SPMM heuristic preallocation spmat* AB = allocSpMatrix(A->M,B->N); if (!AB) goto _err; if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F) (A,B))) goto _err; ///aux structures alloc if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ ERRPRINT("accVects init failed\n"); goto _err; } if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T //prepare sparse accumulators with U.Bounded rows[parts] starts SPACC* accSp; for( idx_t r=0,rSizeCumul=0; rM; rSizeCumul += rowsSizes[r++]){ accSp = outAccumul->accs+r; accSp->JA = outAccumul->JA + rSizeCumul; accSp->AS = outAccumul->AS + rSizeCumul; //accSp->len = rowsSizes[r]; } #endif ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg); AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); #pragma omp parallel for schedule(runtime) private(acc) for (ulong r=0; rM; r++){ //row-by-row formulation //iterate over nz entry index c inside current row r acc = accVects + omp_get_thread_num(); /* direct use of sparse scalar vector multiplication for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; jaIRP[r+1]-OFF_F; ja++){ ca = A->JA[ja] - OFF_F; jb = B->IRP[ca] - OFF_F; bRowLen = B->IRP[ca+1] - B->IRP[ca]; CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc); }*/ for (ulong c=A->IRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) //row-by-row formul CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc); //trasform accumulated dense vector to a CSR row #if SPARSIFY_PRE_PARTITIONING == T _sparsifyUB(acc,outAccumul->accs+r,0); #else sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0); #endif _resetAccVect(acc); //rezero for the next A row } ///merge sparse row computed before if (mergeRows(outAccumul->accs,AB)) goto _err; #if OFF_F != 0 C_FortranShiftIdxs(AB); #endif AUDIT_INTERNAL_TIMES End=omp_get_wtime(); DEBUG checkOverallocPercent(rowsSizes,AB); goto _free; _err: if(AB) freeSpmat(AB); AB=NULL; //nothing'll be returned _free: if(rowsSizes) free(rowsSizes); if(accVects) freeAccsDense(accVects,cfg->threadNum); if(outAccumul) freeSpMMAcc(outAccumul); return AB; } idx_t CAT(spmmRowByRowCalculateSize_,OFF_F)(spmat* A, spmat*B, CONFIG* cfg, void** accumul, void **rows_sizes, void** tmp_matrix){ DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N); ///thread aux ACC_DENSE *accVects = NULL,*acc; SPMM_ACC* outAccumul=NULL; idx_t* rowsSizes = NULL; ///init AB matrix with SPMM heuristic preallocation spmat* AB = allocSpMatrix(A->M,B->N); if (!AB) goto _err; if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F) (A,B))) goto _err; ///aux structures alloc if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ ERRPRINT("accVects init failed\n"); goto _err; } if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T //prepare sparse accumulators with U.Bounded rows[parts] starts SPACC* accSp; for( idx_t r=0,rSizeCumul=0; rM; rSizeCumul += rowsSizes[r++]){ accSp = outAccumul->accs+r; accSp->JA = outAccumul->JA + rSizeCumul; accSp->AS = outAccumul->AS + rSizeCumul; //accSp->len = rowsSizes[r]; } #endif ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg); AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); #pragma omp parallel for schedule(runtime) private(acc) for (ulong r=0; rM; r++){ //row-by-row formulation //iterate over nz entry index c inside current row r acc = accVects + omp_get_thread_num(); /* direct use of sparse scalar vector multiplication for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; jaIRP[r+1]-OFF_F; ja++){ ca = A->JA[ja] - OFF_F; jb = B->IRP[ca] - OFF_F; bRowLen = B->IRP[ca+1] - B->IRP[ca]; CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc); }*/ for (ulong c=A->IRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) //row-by-row formul CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc); //trasform accumulated dense vector to a CSR row #if SPARSIFY_PRE_PARTITIONING == T _sparsifyUB(acc,outAccumul->accs+r,0); #else sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0); #endif _resetAccVect(acc); //rezero for the next A row } ///calculate exact number of non zero elements idx_t nnz; nnz = calculateSize(outAccumul->accs,AB); /// put the sparse accumulator into the argument so that /// it can be retrived in Fortran *accumul = outAccumul; *rows_sizes = rowsSizes; *tmp_matrix = AB; if(accVects) freeAccsDense(accVects,cfg->threadNum); return nnz; _err: if(AB) freeSpmat(AB); AB=NULL; //nothing'll be returned _free: if(rowsSizes) free(rowsSizes); if(accVects) freeAccsDense(accVects,cfg->threadNum); if(outAccumul) freeSpMMAcc(outAccumul); } void CAT(spmmRowByRowPopulate_,OFF_F)(void** accumul, void** rows_sizes, void** tmp_matrix, double** AS, idx_t** JA, idx_t** IRP){ SPMM_ACC* outAccumul= *accumul; idx_t* rowsSizes = *rows_sizes; spmat *AB = *tmp_matrix; mergeRowsPopulate(outAccumul->accs, AB, AS, JA, IRP); #if OFF_F != 0 C_FortranShiftIdxs(AB); #endif AUDIT_INTERNAL_TIMES End=omp_get_wtime(); DEBUG checkOverallocPercent(rowsSizes,AB); if(AB) freeSpmat(AB); if(rowsSizes) free(rowsSizes); if(outAccumul) freeSpMMAcc(outAccumul); } spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N); DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols); ///thread aux ACC_DENSE *accVects = NULL,*acc; SPMM_ACC* outAccumul=NULL; idx_t* rowsSizes = NULL; ///init AB matrix with SPMM heuristic preallocation spmat* AB = allocSpMatrix(A->M,B->N); if (!AB) goto _err; if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) goto _err; ///aux structures alloc if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ ERRPRINT("accVects init failed\n"); goto _err; } if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T //prepare sparse accumulators with U.Bounded rows[parts] starts SPACC* accSp; for( idx_t r=0,rSizeCumul=0; rM; rSizeCumul += rowsSizes[r++]){ accSp = outAccumul->accs+r; accSp->JA = outAccumul->JA + rSizeCumul; accSp->AS = outAccumul->AS + rSizeCumul; } #endif //perform Gustavson over rows blocks -> M / @cfg->gridRows ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows; ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg); AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); ulong b,startRow,block; //omp for aux vars #pragma omp parallel for schedule(runtime) private(acc,startRow,block) for (b=0; b < cfg->gridRows; b++){ block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem); startRow= UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem); acc = accVects + omp_get_thread_num(); DEBUGPRINT{ fflush(NULL); printf("block %lu\t%lu:%lu(%lu)\n",b,startRow,startRow+block-1,block); fflush(NULL); } //row-by-row formulation in the given row block for (ulong r=startRow; rIRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc); //trasform accumulated dense vector to a CSR row #if SPARSIFY_PRE_PARTITIONING == T _sparsifyUB(acc,outAccumul->accs+r,0); #else sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0); #endif _resetAccVect(acc); //rezero for the next A row } } ///merge sparse row computed before if (mergeRows(outAccumul->accs,AB)) goto _err; #if OFF_F != 0 C_FortranShiftIdxs(AB); #endif AUDIT_INTERNAL_TIMES End=omp_get_wtime(); DEBUG checkOverallocPercent(rowsSizes,AB); goto _free; _err: if(AB) freeSpmat(AB); AB=NULL; //nothing'll be returned _free: if(rowsSizes) free(rowsSizes); if(accVects) freeAccsDense(accVects,cfg->threadNum); if(outAccumul) freeSpMMAcc(outAccumul); return AB; } ///2D //PARTITIONS NOT ALLOCATED spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N); DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols); idx_t* bColOffsets = NULL; //B group columns starting offset for each row ACC_DENSE *accVectors=NULL,*accV; SPACC* accRowPart; spmat* AB = allocSpMatrix(A->M,B->N); SPMM_ACC* outAccumul=NULL; idx_t* rowsPartsSizes=NULL; if (!AB) goto _err; //2D indexing aux vars ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols; ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows; ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols; ulong startRow,startCol,rowBlock,colBlock; //data division aux variables ////get bColOffsets for B column groups if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T uint rowsPartsSizesN = aSubRowsN; if (!(rowsPartsSizes = CAT(spMMSizeUpperboundColParts_,OFF_F) (A,B,cfg->gridCols,bColOffsets))) #else uint rowsPartsSizesN = AB->M; if (!(rowsPartsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) #endif goto _err; //aux vectors ///other AUX struct alloc if (!(accVectors = _initAccVectors(gridSize,_colBlock+(_colBlockRem?1:0)))){ ERRPRINT("accVectors calloc failed\n"); goto _err; } if (!(outAccumul = initSpMMAcc(rowsPartsSizes[rowsPartsSizesN],aSubRowsN))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T //prepare sparse accumulators with U.Bounded rows[parts] starts SPACC* accSp; for( idx_t i=0,rSizeCumul=0; iaccs+i; accSp->JA = outAccumul->JA + rSizeCumul; accSp->AS = outAccumul->AS + rSizeCumul; } //memset(outAccumul->AS,0,sizeof(double)*rowsSizes[AB->M]);memset(outAccumul->JA,0,sizeof(idx_t)*rowsSizes[AB->M]); #endif ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg); AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); ulong tileID,t_i,t_j; //for aux vars ulong bPartLen,bPartID,bPartOffset;//B partition acces aux vars #pragma omp parallel for schedule(runtime) \ private(accV,accRowPart,rowBlock,colBlock,startRow,startCol,\ bPartLen,bPartID,bPartOffset,t_i,t_j) for (tileID = 0; tileID < gridSize; tileID++){ ///get iteration's indexing variables //tile index in the 2D grid of AB computation TODO OMP HOW TO PARALLELIZE 2 FOR t_i = tileID/cfg->gridCols; //i-th row block t_j = tileID%cfg->gridCols; //j-th col block //get tile row-cols group FAIR sizes rowBlock = UNIF_REMINDER_DISTRI(t_i,_rowBlock,_rowBlockRem); startRow = UNIF_REMINDER_DISTRI_STARTIDX(t_i,_rowBlock,_rowBlockRem); startCol = UNIF_REMINDER_DISTRI_STARTIDX(t_j,_colBlock,_colBlockRem); accV = accVectors + tileID; DEBUGPRINT{ fflush(NULL); colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem); printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock); printf("colBlock [%lu\t%lu:%lu(%lu)]\n",t_j,startCol,startCol+colBlock-1,colBlock); fflush(NULL); } ///AB[t_i][t_j] block compute for (ulong r=startRow; rIRP[r]-OFF_F,c; jIRP[r+1]-OFF_F; j++){ //get start of B[A->JA[j]][:colBlock:] c = A->JA[j]-OFF_F; // col of nnz in A[r][:] <-> target B row bPartID = IDX2D(c,t_j,cfg->gridCols); bPartOffset = bColOffsets[ bPartID ]; bPartLen = bColOffsets[ bPartID + 1 ] - bPartOffset; CAT(scSparseVectMulPart_,OFF_F)(A->AS[j],B->AS+bPartOffset, B->JA+bPartOffset,bPartLen,startCol,accV); } accRowPart = outAccumul->accs + IDX2D(r,t_j,cfg->gridCols); #if SPARSIFY_PRE_PARTITIONING == T _sparsifyUB(accV,accRowPart,startCol); #else sparsifyUBNoPartsBounds(outAccumul,accV,accRowPart,startCol); #endif _resetAccVect(accV); } } if (mergeRowsPartitions(outAccumul->accs,AB,cfg)) goto _err; #if OFF_F != 0 C_FortranShiftIdxs(AB); #endif AUDIT_INTERNAL_TIMES End=omp_get_wtime(); DEBUG CAT(checkOverallocRowPartsPercent_, OFF_F)(rowsPartsSizes, AB, cfg->gridCols, bColOffsets); goto _free; _err: if (AB) freeSpmat(AB); AB = NULL; _free: free(rowsPartsSizes); free(bColOffsets); if (accVectors) freeAccsDense(accVectors,gridSize); if (outAccumul) freeSpMMAcc(outAccumul); return AB; } spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%luxN=%lu\n",A->M,B->N); DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols); spmat *AB = NULL, *colPartsB = NULL, *colPart; idx_t* rowsPartsSizes=NULL; //aux vectors SPMM_ACC* outAccumul=NULL; ACC_DENSE *accVectors=NULL,*accV; SPACC* accRowPart; ulong startRow,startCol,rowBlock,colBlock; //data division aux variables //2D indexing aux vars idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols; idx_t* bColOffsets = NULL; if (!(AB = allocSpMatrix(A->M,B->N))) goto _err; ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows; ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols; ////B cols partition in CSRs //if (!(colPartsB = CAT(colsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) goto _err; if (!(colPartsB = CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F) (B, cfg->gridCols, &bColOffsets))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T uint rowsPartsSizesN = aSubRowsN; if (!(rowsPartsSizes = CAT(spMMSizeUpperboundColParts_,OFF_F) (A, B, cfg->gridCols, bColOffsets))) #else uint rowsPartsSizesN = AB->M; if (!(rowsPartsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) #endif goto _err; ///other AUX struct alloc if (!(accVectors = _initAccVectors(gridSize,_colBlock+(_colBlockRem?1:0)))){ ERRPRINT("accVectors calloc failed\n"); goto _err; } if (!(outAccumul = initSpMMAcc(rowsPartsSizes[rowsPartsSizesN],aSubRowsN))) goto _err; #if SPARSIFY_PRE_PARTITIONING == T //prepare sparse accumulators with U.Bounded rows[parts] starts SPACC* accSp; for( idx_t i=0,rLenCumul=0; iaccs+i; accSp->JA = outAccumul->JA + rLenCumul; accSp->AS = outAccumul->AS + rLenCumul; } #endif ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg); AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); ulong tileID,t_i,t_j; //for aux vars #pragma omp parallel for schedule(runtime) \ private(accV,accRowPart,colPart,rowBlock,colBlock,startRow,startCol,t_i,t_j) for (tileID = 0; tileID < gridSize; tileID++){ ///get iteration's indexing variables //tile index in the 2D grid of AB computation TODO OMP HOW TO PARALLELIZE 2 FOR t_i = tileID/cfg->gridCols; //i-th row block t_j = tileID%cfg->gridCols; //j-th col block //get tile row-cols group FAIR sizes rowBlock = UNIF_REMINDER_DISTRI(t_i,_rowBlock,_rowBlockRem); colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem); startRow = UNIF_REMINDER_DISTRI_STARTIDX(t_i,_rowBlock,_rowBlockRem); startCol = UNIF_REMINDER_DISTRI_STARTIDX(t_j,_colBlock,_colBlockRem); colPart = colPartsB + t_j; accV = accVectors + tileID; DEBUGPRINT{ fflush(NULL); printf("rowBlock [%lu\t%lu:%lu(%lu)]\t", t_i,startRow,startRow+rowBlock-1,rowBlock); printf("colBlock [%lu\t%lu:%lu(%lu)]\n", t_j,startCol,startCol+colBlock-1,colBlock); fflush(NULL); } ///AB[t_i][t_j] block compute for (ulong r=startRow; rIRP[r]-OFF_F,c,bRowStart,bRowLen; jIRP[r+1]-OFF_F; j++){ //get start of B[A->JA[j]][:colBlock:] c = A->JA[j]-OFF_F; // column of nnz entry in A[r][:] <-> target B row bRowStart = colPart->IRP[c]; #ifdef ROWLENS bRowLen = colPart->RL[c]; #else bRowLen = colPart->IRP[c+1] - bRowStart; #endif CAT(scSparseVectMulPart_,OFF_F)(A->AS[j], colPart->AS+bRowStart,colPart->JA+bRowStart, bRowLen,startCol,accV); } accRowPart = outAccumul->accs + IDX2D(r,t_j,cfg->gridCols); #if SPARSIFY_PRE_PARTITIONING == T _sparsifyUB(accV,accRowPart,startCol); #else sparsifyUBNoPartsBounds(outAccumul,accV,accRowPart,startCol); #endif _resetAccVect(accV); } } if (mergeRowsPartitions(outAccumul->accs,AB,cfg)) goto _err; #if OFF_F != 0 C_FortranShiftIdxs(AB); #endif AUDIT_INTERNAL_TIMES End=omp_get_wtime(); DEBUG CAT(checkOverallocRowPartsPercent_, OFF_F)(rowsPartsSizes,AB,cfg->gridCols,bColOffsets); goto _free; _err: ERRPRINT("spmmRowByRow2DBlocksAllocated failed\n"); if (AB) freeSpmat(AB); AB = NULL; _free: if (colPartsB){ for (ulong i=0; igridCols; i++) freeSpmatInternal(colPartsB+i); free(colPartsB); } free(rowsPartsSizes); free(bColOffsets); if (accVectors) freeAccsDense(accVectors,gridSize); if (outAccumul) freeSpMMAcc(outAccumul); return AB; } ///SP3MM spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P, CONFIG* cfg,SPMM_INTERF spmm){ double end,start,elapsed,partial,flops; spmat *RAC = NULL, *out = NULL; if (!spmm){ //TODO runtime on sizes decide witch spmm implementation to use if not given spmm = &CAT(spmmRowByRow2DBlocks_,OFF_F); } /* TODO alloc dense aux vector, reusable over 3 product TODO arrays sovrallocati per poter essere riusati nelle 2 SpMM ulong auxVectSize = MAX(R->N,AC->N); auxVectSize = MAX(auxVectSize,P->N); */ start = omp_get_wtime(); /// triple product as a pair of spmm if (!(RAC = spmm(R,AC,cfg))) goto _free; AUDIT_INTERNAL_TIMES partial = End - Start; if (!(out = spmm(RAC,P,cfg))) goto _free; // end = omp_get_wtime(); ElapsedInternal = End - Start + partial; VERBOSE { elapsed = end - start; flops = (2 * R->NZ * P->NZ * AC->NZ) / (elapsed); printf("elapsed %le - flops %le",elapsed,flops); AUDIT_INTERNAL_TIMES printf("\tinternalTime: %le",ElapsedInternal); printf("\n"); } _free: zeroSpmat(RAC); freeSpmat(RAC); return out; } ////////Sp3MM direct ///1D spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg, SPMM_INTERF spmm){ ulong* rowSizes = NULL; SPMM_ACC* outAccumul=NULL; ACC_DENSE *accVectorsR_AC=NULL,*accVectorsRAC_P=NULL,*accRAC,*accRACP; ///init AB matrix with SPMM heuristic preallocation spmat* out = allocSpMatrix(R->M,P->N); if (!out) goto _err; /*TODO 3MM VERSION COMPUTE OUT ALLOC : -> \forall RAC.row -> hashmap{col=True}->(AVL||RBTHREE); upperBound std col RAC.rows.cols in hashmap || SYM_bis * NB: UP per RACP => NN note dimensioni righe precise => stesso approccio riservazione spazio di spmm ( fetch_and_add ) * SYM_BIS ==> note dimensioni righe => * 1) pre riservazione spazio per righe -> cache allignement per threads -(sc. static & blocco di righe allineato a cache block successivo a blocco righe precedente) -(sc. dynamic& righe tutte allineate a cache block (NO OVERLAPS!) -> huge overhead ? * 2) pre riservazione spazio righe diretamente in out CSR -> probabili cache blocks overlap; salvo costo di P.M memcpy */ if (!(rowSizes = CAT(spMMSizeUpperbound_,OFF_F)(R,AC))) goto _err; ///TODO TOO LOOSE UB...INTEGRATE RBTREE FOR SYM->PRECISE ///aux structures alloc if (!(outAccumul = initSpMMAcc(rowSizes[R->M],P->M))) goto _err; //TODO size estimated with RAC mat if (!(accVectorsR_AC = _initAccVectors(cfg->threadNum,AC->N))){ //TODO LESS || REUSE ERRPRINT("accVectorsR_AC init failed\n"); goto _err; } if (!(accVectorsRAC_P = _initAccVectors(cfg->threadNum,R->N))){ //TODO LESS || REUSE ERRPRINT("accVectorsRAC_P init failed\n"); goto _err; } ulong c; ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (R->M,R,cfg); AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); #pragma omp parallel for schedule(runtime) private(accRAC,accRACP,c) for (ulong r=0; rM; r++){ //row-by-row formulation //iterate over nz entry index c inside current row r accRAC = accVectorsR_AC + omp_get_thread_num(); accRACP = accVectorsRAC_P + omp_get_thread_num(); //computing (tmp) R*AC r-th row for (idx_t j=R->IRP[r]-OFF_F; jIRP[r+1]-OFF_F; j++) CAT(scSparseRowMul_,OFF_F)(R->AS[j], AC, R->JA[j]-OFF_F, accRAC); //forward the computed row for (idx_t j=0; jnnzIdxMap.len; j++){ c = accRAC->nnzIdx[j]; CAT(scSparseRowMul_,OFF_F)(accRAC->v[c],P,c,accRACP); } //trasform accumulated dense vector to a CSR row TODO in UB buff sparsifyUBNoPartsBounds(outAccumul,accRACP,outAccumul->accs+r,0); _resetAccVect(accRAC); _resetAccVect(accRACP); } ///merge sparse row computed before if (mergeRows(outAccumul->accs,out)) goto _err; #if OFF_F != 0 C_FortranShiftIdxs(out); #endif AUDIT_INTERNAL_TIMES{ End=omp_get_wtime(); ElapsedInternal = End-Start; } DEBUG checkOverallocPercent(rowSizes,out); goto _free; _err: if(out) freeSpmat(out); out = NULL; _free: if(rowSizes) free(rowSizes); if(accVectorsR_AC) freeAccsDense(accVectorsR_AC,cfg->threadNum); if(accVectorsRAC_P) freeAccsDense(accVectorsRAC_P,cfg->threadNum); if(outAccumul) freeSpMMAcc(outAccumul); return out; }