Refactoring private queues, still not working

omp-walther
StefanoPetrilli 3 years ago
parent deab695294
commit b5e52d31f5

@@ -179,7 +179,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 MilanLongInt StartIndex, MilanLongInt EndIndex,
 MilanLongInt* numGhostEdgesPtr,
 MilanLongInt* numGhostVerticesPtr,
-MilanLongInt* insertMePtr,
 MilanLongInt* S,
 MilanLongInt* verLocInd,
 MilanLongInt* verLocPtr,
@@ -196,7 +195,12 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 vector<MilanLongInt>& QMsgType,
 vector<MilanInt>& QOwner,
 MilanLongInt* &candidateMate,
-staticQueue& U
+staticQueue& U,
+staticQueue& privateU,
+staticQueue& privateQLocalVtx,
+staticQueue& privateQGhostVtx,
+staticQueue& privateQMsgType,
+staticQueue& privateQOwner
 );
 void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP

@@ -185,7 +185,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 //Build the Ghost Vertex Set: Vg
 map <MilanLongInt, MilanLongInt> Ghost2LocalMap; //Map each ghost vertex to a local vertex
 vector <MilanLongInt> Counter; //Store the edge count for each ghost vertex
-MilanLongInt numGhostVertices = 0, numGhostEdges = 0, insertMe = 0; //Number of Ghost vertices
+MilanLongInt numGhostVertices = 0, numGhostEdges = 0; //Number of Ghost vertices
 #ifdef PRINT_DEBUG_INFO_
 cout<<"\n("<<myRank<<")About to compute Ghost Vertices..."; fflush(stdout);
@@ -218,7 +218,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 initialize(NLVer, NLEdge, StartIndex,
 EndIndex, &numGhostEdges,
-&numGhostVertices, &insertMe, &S,
+&numGhostVertices, &S,
 verLocInd, verLocPtr,
 MateLock,
 Ghost2LocalMap, Counter,
@@ -226,20 +226,27 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 tempCounter, GMate,
 Message, QLocalVtx,
 QGhostVtx, QMsgType, QOwner,
-candidateMate, U);
+candidateMate, U,
+privateU,
+privateQLocalVtx,
+privateQGhostVtx,
+privateQMsgType,
+privateQOwner
+);
 finishTime = MPI_Wtime();
 *ph0_time = finishTime - startTime; //Time taken for Phase-0: Initialization
 startTime = MPI_Wtime();
 /////////////////////////////////////////////////////////////////////////////////////////
 //////////////////////////////////// INITIALIZATION /////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////
 //Compute the Initial Matching Set:
-#pragma omp parallel private(insertMe, k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateU, privateMyCard, isEmpty, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) firstprivate(StartIndex, EndIndex) default(shared) num_threads(4)
+#pragma omp parallel private(k, u, w, v, k1, adj1, adj2, adj11, adj12, heaviestEdgeWt, ghostOwner, privateMyCard, isEmpty) firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) num_threads(4)
 {
 /*
 * OMP PARALLEL_COMPUTE_CANDIDATE_MATE_B has been split from
@@ -272,21 +279,6 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 */
-MilanLongInt size = numGhostVertices; //TODO how can I decide a more meaningfull size?
-//Fail messages
-privateQLocalVtx.~staticQueue();
-privateQGhostVtx.~staticQueue();
-privateQMsgType.~staticQueue();
-privateQOwner.~staticQueue();
-privateU.~staticQueue();
-new(&privateU) staticQueue(NLVer + numGhostVertices); //TODO how can I put a meaningfull size?
-new(&privateQLocalVtx) staticQueue(size);
-new(&privateQGhostVtx) staticQueue(size);
-new(&privateQMsgType) staticQueue(size);
-new(&privateQOwner) staticQueue(size);
 #pragma omp for reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) schedule(static)
 for (v = 0; v < NLVer; v++) {
 //Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
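A side note on the reduction(+: msgInd, NumMessagesBundled, myCard, PCounter[:numProcs]) clause that survives this hunk: PCounter[:numProcs] is an OpenMP array-section reduction, so each thread accumulates into its own zero-initialized copy of the whole array and the copies are summed element-wise when the loop finishes. A minimal sketch of that mechanism, with made-up sizes and an illustrative owner array rather than anything from the real code:

#include <cstdio>
#include <vector>

int main()
{
    const int numProcs = 4;              // illustrative process count
    const long nEdges = 1000;            // illustrative edge count
    std::vector<int> owner(nEdges);      // illustrative owning rank of each ghost edge
    for (long i = 0; i < nEdges; i++) owner[i] = (int)(i % numProcs);

    long PCounter[numProcs] = {0};       // per-owner message counters

    // Each thread accumulates into a private, zero-initialized copy of
    // PCounter[0:numProcs]; OpenMP sums the copies into the original afterwards.
#pragma omp parallel for reduction(+ : PCounter[:numProcs]) schedule(static)
    for (long i = 0; i < nEdges; i++)
        PCounter[owner[i]]++;

    for (int p = 0; p < numProcs; p++)
        printf("messages for rank %d: %ld\n", p, PCounter[p]);
    return 0;
}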
@@ -334,8 +326,8 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 assert(ghostOwner != myRank);
 PCounter[ghostOwner]++;
 //TODO why does it fail if I use a private data structure???
 /*
 privateQLocalVtx.push_back(v + StartIndex);
 privateQGhostVtx.push_back(w);
@@ -351,6 +343,7 @@ void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
 QMsgType.push_back(REQUEST);
 QOwner.push_back(ghostOwner);
 } // end of critical region
 if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
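Incidentally, the commented-out privateQLocalVtx/privateQGhostVtx push_back calls above show the pattern this refactoring is aiming for: fill thread-private queues with no locking, then merge them into the shared Q* containers once per thread instead of entering the critical region for every ghost edge. The commit does not get there yet; the following is only a sketch of that pattern, using std::vector as a stand-in for staticQueue and illustrative names and arguments, not the real loop body:

#include <vector>

void gatherRequests(const std::vector<long> &ghostOf,   // ghost vertex per local vertex (illustrative)
                    const std::vector<int> &ownerOf,    // owning rank per local vertex (illustrative)
                    std::vector<long> &QLocalVtx,
                    std::vector<long> &QGhostVtx,
                    std::vector<int> &QOwner)
{
#pragma omp parallel
    {
        // Thread-private buffers: no synchronization while filling them.
        std::vector<long> privateQLocalVtx, privateQGhostVtx;
        std::vector<int> privateQOwner;

#pragma omp for schedule(static)
        for (long v = 0; v < (long)ghostOf.size(); v++) {
            privateQLocalVtx.push_back(v);
            privateQGhostVtx.push_back(ghostOf[v]);
            privateQOwner.push_back(ownerOf[v]);
        }

        // One critical section per thread, not one per ghost edge.
#pragma omp critical(mergeQueues)
        {
            QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
            QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
            QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
        }
    }
}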

@@ -80,9 +80,11 @@ class staticQueue
 MilanLongInt squeueTail;
 MilanLongInt NumNodes;
+//FIXME I had to comment this piece of code in order to make everything work.
+// why?
 //Prevent Assignment and Pass by Value:
-staticQueue(const staticQueue& src);
-staticQueue& operator=(const staticQueue& rhs);
+//staticQueue(const staticQueue& src);
+//staticQueue& operator=(const staticQueue& rhs);
 public:
 //Constructors and Destructors
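A plausible answer to the FIXME above (an inference, not something the commit states): the new firstprivate(privateU, privateQLocalVtx, ...) clause copy-constructs each thread's queue from the shared original, and firstprivate requires an accessible copy constructor, so the private declaration had to be commented out to let the compiler generate a public one. A minimal stand-in sketch; it writes an explicit deep-copying constructor for clarity, whereas the commit relies on the implicitly generated copy:

class Queue {                    // stand-in for staticQueue, not the real class
    long *data;
    long cap;
    // Declaring the copy constructor private (as staticQueue originally did)
    // would make firstprivate(q) below ill-formed:
    //   Queue(const Queue &src);
public:
    explicit Queue(long n) : data(new long[n]), cap(n) {}
    Queue(const Queue &src) : data(new long[src.cap]), cap(src.cap) {} // one deep copy per thread
    ~Queue() { delete[] data; }
    long capacity() const { return cap; }
};

int main()
{
    Queue q(64);                 // sized once, before the parallel region
#pragma omp parallel firstprivate(q) num_threads(4)
    {
        // Each thread gets its own copy-constructed q; with private(q) the
        // copies would be default-constructed instead (and Queue has no
        // default constructor, so private(q) would not even compile here).
        (void)q.capacity();
    }
    return 0;
}

Whether the implicitly generated copy is deep or shallow depends on what staticQueue stores internally; if it only copies a raw pointer, the per-thread copies would still share one buffer, which might be related to the "still not working" in the commit title.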

@@ -12,7 +12,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 MilanLongInt StartIndex, MilanLongInt EndIndex,
 MilanLongInt* numGhostEdgesPtr,
 MilanLongInt* numGhostVerticesPtr,
-MilanLongInt* insertMePtr,
 MilanLongInt* S,
 MilanLongInt* verLocInd,
 MilanLongInt* verLocPtr,
@@ -29,7 +28,12 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 vector<MilanLongInt>& QMsgType,
 vector<MilanInt>& QOwner,
 MilanLongInt* &candidateMate,
-staticQueue& U
+staticQueue& U,
+staticQueue& privateU,
+staticQueue& privateQLocalVtx,
+staticQueue& privateQGhostVtx,
+staticQueue& privateQMsgType,
+staticQueue& privateQOwner
 )
 {
@@ -37,7 +41,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 MilanLongInt adj1, adj2;
 int i, v, k, w;
 // index that starts with zero to |Vg| - 1
 map<MilanLongInt, MilanLongInt>::iterator storedAlready;
@@ -64,10 +67,9 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 *
 * Despite the critical region it is still productive to
 * parallelize this for because the critical region is executed
-* only when a ghost edge is found and ghost edges are a minority.
+* only when a ghost edge is found and ghost edges are a minority,
+* circa 3.5% during the tests.
 */
-// TODO comments about the reduction
 #pragma omp for reduction(+ : numGhostEdges)
 for (i = 0; i < NLEdge; i++) { //O(m) - Each edge stored twice
 insertMe = verLocInd[i];
@@ -90,8 +92,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 } //End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
 } //End of for(ghost vertices)
 #pragma omp single
 {
 //numGhostEdges = atomicNumGhostEdges;
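For orientation (the hunk does not show the full loop body): the counting phase combines a reduction(+ : numGhostEdges) for the cheap counter with a critical region that is entered only for the rare ghost endpoints, which is what the comment about the circa 3.5% hit rate refers to. A hypothetical sketch of such a loop, with simplified types and names, not the actual implementation:

#include <map>
#include <vector>

void countGhosts(long NLEdge, long StartIndex, long EndIndex,
                 const std::vector<long> &verLocInd,
                 std::map<long, long> &Ghost2LocalMap,
                 std::vector<long> &Counter,
                 long &numGhostEdgesOut)
{
    long numGhostEdges = 0;

#pragma omp parallel for reduction(+ : numGhostEdges)
    for (long i = 0; i < NLEdge; i++) {
        long w = verLocInd[i];
        if (w < StartIndex || w > EndIndex) { // endpoint owned by another rank
            numGhostEdges++;                  // cheap path, handled by the reduction
#pragma omp critical(ghostMap)
            {
                // Entered only for ghost endpoints, so the serialization is rare.
                std::map<long, long>::iterator it = Ghost2LocalMap.find(w);
                if (it == Ghost2LocalMap.end()) {
                    Ghost2LocalMap[w] = (long)Counter.size(); // new ghost vertex -> local index
                    Counter.push_back(1);                     // first edge touching it
                } else {
                    Counter[it->second]++;                    // another edge to a known ghost
                }
            }
        }
    }
    numGhostEdgesOut = numGhostEdges;
}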
@@ -143,7 +143,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 /*
 * Not parallelizable
 */
 for (i = 0; i < numGhostVertices; i++) { //O(|Ghost Vertices|)
 verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i];
 #ifdef PRINT_DEBUG_INFO_
@@ -163,6 +162,10 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 fflush(stdout);
 #endif
+#ifdef TIME_TRACKER
+double verGhostIndInitialization = MPI_Wtime();
+#endif
 /*
 * OMP verGhostIndInitialization
 *
@@ -175,13 +178,8 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 * Despite the critical region it's still useful to
 * parallelize the for because the ghost nodes
 * are a minority hence the critical region is executed
-* few times.
+* few times, circa 3.5% of the time in the tests.
 */
-#ifdef TIME_TRACKER
-double verGhostIndInitialization = MPI_Wtime();
-#endif
 #pragma omp for nowait schedule(static)
 for (v = 0; v < NLVer; v++) {
 adj1 = verLocPtr[v]; //Vertex Pointer
@@ -192,17 +190,14 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 #pragma omp critical
 {
 insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; //Where to insert
-verGhostInd[insertMe] = v + StartIndex; //Add the adjacency
 tempCounter[Ghost2LocalMap[w]]++; //Increment the counter
 }
+verGhostInd[insertMe] = v + StartIndex; //Add the adjacency
 } //End of if((w < StartIndex) || (w > EndIndex))
 } //End of for(k)
 } //End of for (v)
-}
-#pragma omp single
-{
+} // End of parallel region
 #ifdef TIME_TRACKER
 verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization;
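This hunk also shrinks the critical region: only computing the insertion slot and bumping tempCounter stay under the lock, and the write to verGhostInd happens outside it using the slot the thread just reserved. That is race-free only if insertMe is private to each thread, which the hunk itself does not show and is assumed here. A sketch of the pattern under that assumption:

#include <vector>

// Each thread reserves a slot for ghost vertex g under the lock, then fills
// the slot without holding the lock; distinct slots mean no race on the write.
void addAdjacency(long v, long g,
                  const std::vector<long> &verGhostPtr,
                  std::vector<long> &tempCounter,
                  std::vector<long> &verGhostInd)
{
    long insertMe; // must be local to the calling thread
#pragma omp critical(reserveSlot)
    {
        insertMe = verGhostPtr[g] + tempCounter[g]; // where to insert
        tempCounter[g]++;                           // reserve the slot
    }
    verGhostInd[insertMe] = v;                      // write outside the lock
}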
@@ -216,11 +211,6 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 cout<<endl; fflush(stdout);
 #endif
-Message.resize(3, -1);
-//message_type = 0;
-//NumMessagesBundled = 0;
-//ghostOwner = 0;
 try {
 QLocalVtx.reserve(numGhostEdges); //Local Vertex
 QGhostVtx.reserve(numGhostEdges); //Ghost Vertex
@@ -232,23 +222,19 @@ inline void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
 exit(1);
 }
-} // end of single region
 #ifdef PRINT_DEBUG_INFO_
 cout<<"\n("<<myRank<<")Allocating CandidateMate.. "; fflush(stdout);
 #endif
 *numGhostEdgesPtr = numGhostEdges;
 *numGhostVerticesPtr = numGhostVertices;
-*insertMePtr = insertMe;
 //Allocate Data Structures:
 /*
 * candidateMate was a vector and has been replaced with a raw array
-* there is no point in using the vector (or maybe there is???)
+* there is no point in using the vector (or maybe there is (???))
 * so I replaced it with an array which is slightly faster
 */
-//candidateMate = new MilanLongInt[NLVer + numGhostVertices];
 candidateMate = new MilanLongInt[NLVer + numGhostVertices];
@@ -267,7 +253,6 @@ cout<<"\n("<<myRank<<")Allocating CandidateMate.. "; fflush(stdout);
 *S = numGhostVertices; //Initialize S with number of Ghost Vertices
 /*
 * Create the Queue Data Structure for the Dominating Set
 *
@@ -276,6 +261,14 @@ cout<<"\n("<<myRank<<")Allocating CandidateMate.. "; fflush(stdout);
 * of a staticQueue I had to destroy the previous object and instantiate
 * a new one of the correct size.
 */
-U.~staticQueue();
 new(&U) staticQueue(NLVer + numGhostVertices);
+//TODO how can I decide a more meaningful size?
+MilanLongInt size = numGhostVertices;
+new(&privateU) staticQueue(NLVer + numGhostVertices); //TODO how can I put a meaningful size?
+new(&privateQLocalVtx) staticQueue(size);
+new(&privateQGhostVtx) staticQueue(size);
+new(&privateQMsgType) staticQueue(size);
+new(&privateQOwner) staticQueue(size);
 }
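The per-thread queue construction that the parallel region used to perform (see the block removed further up) now happens once here, via placement new, rebuilding each staticQueue with the size that is only known after the ghost vertices have been counted. A generic sketch of that reconstruct-in-place idiom with a stand-in class, not the real staticQueue:

#include <cstddef>
#include <new>

// Stand-in for staticQueue: capacity fixed at construction, no assignment,
// so the only way to change its size is to rebuild it in place.
class FixedQueue {
    long *buf;
    std::size_t cap;
    FixedQueue &operator=(const FixedQueue &); // not implemented: no assignment
public:
    explicit FixedQueue(std::size_t n = 0) : buf(n ? new long[n] : 0), cap(n) {}
    ~FixedQueue() { delete[] buf; }
    std::size_t capacity() const { return cap; }
};

int main()
{
    FixedQueue U;                 // default-constructed before the size is known
    std::size_t size = 1024;      // e.g. NLVer + numGhostVertices, known only later

    U.~FixedQueue();              // release the old state...
    new (&U) FixedQueue(size);    // ...and rebuild the object in the same storage
    // Skipping the explicit destructor call, as the lines above do, is only
    // harmless when the old object owns nothing that needs releasing.

    return (U.capacity() == size) ? 0 : 1;
}

A reserve()-style member function, or wrapping the queue in std::optional, would usually be a cleaner way to defer sizing, but either would require changing staticQueue itself.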
