#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMatchedVerticesAndSendMessagesD(
					   MilanLongInt NLVer,
					   vector<MilanLongInt> &UChunkBeingProcessed,
					   vector<MilanLongInt> &U,
					   vector<MilanLongInt> &privateU,
					   MilanLongInt StartIndex,
					   MilanLongInt EndIndex,
					   MilanLongInt *myCard,
					   MilanLongInt *msgInd,
					   MilanLongInt *NumMessagesBundled,
					   MilanLongInt *SPtr,
					   MilanLongInt *verLocPtr,
					   MilanLongInt *verLocInd,
					   MilanLongInt *verDistance,
					   MilanLongInt *PCounter,
					   vector<MilanLongInt> &Counter,
					   MilanInt myRank,
					   MilanInt numProcs,
					   MilanLongInt *candidateMate,
					   vector<MilanLongInt> &GMate,
					   MilanLongInt *Mate,
					   map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
					   MilanReal *edgeLocWeight,
					   vector<MilanLongInt> &QLocalVtx,
					   vector<MilanLongInt> &QGhostVtx,
					   vector<MilanLongInt> &QMsgType,
 					   vector<MilanInt> &QOwner,
					   vector<MilanLongInt> &privateQLocalVtx,
					   vector<MilanLongInt> &privateQGhostVtx,
					   vector<MilanLongInt> &privateQMsgType,
					   vector<MilanInt> &privateQOwner,
					   MPI_Comm comm,
					   MilanLongInt *msgActual,
					   vector<MilanLongInt> &Message)
{

    MilanLongInt initialSize = QLocalVtx.size();
    MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
    int option;
    MilanLongInt mateVal;

#ifdef PRINT_DEBUG_INFO_
    cout << "\n(" << myRank << "=========================************===============================" << endl;
    fflush(stdout);
    fflush(stdout);
#endif

#ifdef COUNT_LOCAL_VERTEX
    MilanLongInt localVertices = 0;
#endif
    //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
  firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, \
	       privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) \
  default(shared)							\
      num_threads(NUM_THREAD)						\
  reduction(+								\
	    : msgInd[:1], PCounter					\
	    [:numProcs], myCard						\
	    [:1], NumMessagesBundled					\
	    [:1], msgActual						\
	    [:1])
    {

        while (!U.empty()) {

	  extractUChunk(UChunkBeingProcessed, U, privateU);

	  for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
	    cout << "\n(" << myRank << ")u: " << u;
	    fflush(stdout);
#endif
	    if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices

#ifdef COUNT_LOCAL_VERTEX
	      localVertices++;
#endif
	      // Get the Adjacency list for u
	      adj1 = verLocPtr[u - StartIndex]; // Pointer
	      adj2 = verLocPtr[u - StartIndex + 1];
	      for (k = adj1; k < adj2; k++) {
		option = -1;
		v = verLocInd[k];

		if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:

#ifdef PRINT_DEBUG_INFO_
		  cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
		  fflush(stdout);
#endif
#pragma omp atomic read
		  mateVal = Mate[v - StartIndex];
		  // If the current vertex is pointing to a matched vertex and is not matched
		  if (mateVal < 0) {
#pragma omp critical
		    {
#pragma omp atomic read
		      mateVal = Mate[v - StartIndex];
		      // If the current vertex is pointing to a matched vertex and is not matched
		      if (mateVal < 0) {
			
			if (candidateMate[v - StartIndex] == u)  {
			  // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
			  w = computeCandidateMateD(verLocPtr[v - StartIndex],
						   verLocPtr[v - StartIndex + 1],
						   edgeLocWeight, 0,
						   verLocInd, StartIndex, EndIndex,
						   GMate, Mate, Ghost2LocalMap);
			  candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
			  cout << "\n(" << myRank << ")" << v << " Points to: " << w;
			  fflush(stdout);
#endif
			  // If found a dominating edge:
			  if (w >= 0)  {
			    
			    if ((w < StartIndex) || (w > EndIndex))  { // A ghost
#ifdef PRINT_DEBUG_INFO_
			      cout << "\n(" << myRank << ")Sending a request message:";
			      cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
			      option = 2;
			      
			      if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
				option = 1;
				Mate[v - StartIndex] = w;     // v is a local vertex
				GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
				
			      } // End of if CandidateMate[w] = v
			    }     // End of if a Ghost Vertex
			    else   { // w is a local vertex
			      if (candidateMate[w - StartIndex] == v)  {
				option = 3;
				Mate[v - StartIndex] = w; // v is a local vertex
				Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
				cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
				fflush(stdout);
#endif
			      } // End of if(CandidateMate(w) = v
			    }     // End of Else
			  }         // End of if(w >=0)
			  else
			    option = 4; // End of Else: w == -1
			  // End:   PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
			} // End of If (candidateMate[v-StartIndex] == u
		      }
		    }     // End of task
		  }         // mateval < 0
		}             // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
		else { // Neighbor is a ghost vertex

#pragma omp critical
		  {
		    if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
		      candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
		    if (v != Mate[u - StartIndex])
		      option = 5; // u is local
		  }                   // End of critical
		}                       // End of Else //A Ghost Vertex

		switch (option)
		  {
		  case -1:
		    // No things to do
		    break;
		  case 1:
		    // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
		    privateU.push_back(v);
		    privateU.push_back(w);
		    (*myCard)++;
#ifdef PRINT_DEBUG_INFO_
		    cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
		    fflush(stdout);
#endif
		    // Decrement the counter:
		    PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
		  case 2:

		    // Found a dominating edge, it is a ghost
		    ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);

		    // Build the Message Packet:
		    // Message[0] = v;       // LOCAL
		    // Message[1] = w;       // GHOST
		    // Message[2] = REQUEST; // TYPE
		    // Send a Request (Asynchronous)
		    // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);

		    (*msgActual)++;
		    (*msgInd)++;

		    privateQLocalVtx.push_back(v);
		    privateQGhostVtx.push_back(w);
		    privateQMsgType.push_back(REQUEST);
		    privateQOwner.push_back(ghostOwner);
		    break;
		  case 3:
		    privateU.push_back(v);
		    privateU.push_back(w);
		    (*myCard)++;
		    break;
		  case 4:
		    // Could not find a dominating vertex
		    adj11 = verLocPtr[v - StartIndex];
		    adj12 = verLocPtr[v - StartIndex + 1];
		    for (k1 = adj11; k1 < adj12; k1++) {
		      w = verLocInd[k1];
		      if ((w < StartIndex) || (w > EndIndex))  { // A ghost
#ifdef PRINT_DEBUG_INFO_
			cout << "\n(" << myRank << ")Sending a failure message: ";
			cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
			fflush(stdout);
#endif
			ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
			// Build the Message Packet:
			// Message[0] = v;       // LOCAL
			// Message[1] = w;       // GHOST
			// Message[2] = FAILURE; // TYPE
			// Send a Request (Asynchronous)
			// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);

			(*msgActual)++;
			(*msgInd)++;
			privateQLocalVtx.push_back(v);
			privateQGhostVtx.push_back(w);
			privateQMsgType.push_back(FAILURE);
			privateQOwner.push_back(ghostOwner);

		      } // End of if(GHOST)
		    }     // End of for loop
		    break;
		  case 5:
		  default:

#ifdef PRINT_DEBUG_INFO_
		    cout << "\n(" << myRank << ")Sending a success message: ";
		    cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
		    fflush(stdout);
#endif

		    ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
		    // Build the Message Packet:
		    // Message[0] = u;       // LOCAL
		    // Message[1] = v;       // GHOST
		    // Message[2] = SUCCESS; // TYPE
		    // Send a Request (Asynchronous)
		    // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
		    (*msgActual)++;
		    (*msgInd)++;
		    privateQLocalVtx.push_back(u);
		    privateQGhostVtx.push_back(v);
		    privateQMsgType.push_back(SUCCESS);
		    privateQOwner.push_back(ghostOwner);

		    break;
		  } // End of switch
	      }     // End of inner for
	    }
	  } // End of outer for

	  queuesTransfer(U, privateU, QLocalVtx,
			 QGhostVtx,
			 QMsgType, QOwner, privateQLocalVtx,
			 privateQGhostVtx,
			 privateQMsgType,
			 privateQOwner);

        } // End of while ( !U.empty() )

#ifdef COUNT_LOCAL_VERTEX
        printf("Count local vertexes: %ld for thread %d of processor %d\n",
               localVertices, mp_get_thread_num(), myRank);
#endif
    } // End of parallel region

    // Send the messages
#ifdef DEBUG_HANG_
    cout << myRank<<" Sending: "<<QOwner.size()-initialSize<<" messages" <<endl;
#endif
    for (int i = initialSize; i < QOwner.size(); i++)   {
      Message[0] = QLocalVtx[i];
      Message[1] = QGhostVtx[i];
      Message[2] = QMsgType[i];
      ghostOwner = QOwner[i];
      //MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
      //cout << myRank<<" Sending to "<<ghostOwner<<endl;
      MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
    }
#ifdef DEBUG_HANG_
    cout << myRank<<" Done sending messages"<<endl;
#endif
}



void processMatchedVerticesAndSendMessagesS(
					   MilanLongInt NLVer,
					   vector<MilanLongInt> &UChunkBeingProcessed,
					   vector<MilanLongInt> &U,
					   vector<MilanLongInt> &privateU,
					   MilanLongInt StartIndex,
					   MilanLongInt EndIndex,
					   MilanLongInt *myCard,
					   MilanLongInt *msgInd,
					   MilanLongInt *NumMessagesBundled,
					   MilanLongInt *SPtr,
					   MilanLongInt *verLocPtr,
					   MilanLongInt *verLocInd,
					   MilanLongInt *verDistance,
					   MilanLongInt *PCounter,
					   vector<MilanLongInt> &Counter,
					   MilanInt myRank,
					   MilanInt numProcs,
					   MilanLongInt *candidateMate,
					   vector<MilanLongInt> &GMate,
					   MilanLongInt *Mate,
					   map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
					   MilanFloat *edgeLocWeight,
					   vector<MilanLongInt> &QLocalVtx,
					   vector<MilanLongInt> &QGhostVtx,
					   vector<MilanLongInt> &QMsgType,
					   vector<MilanInt> &QOwner,
					   vector<MilanLongInt> &privateQLocalVtx,
					   vector<MilanLongInt> &privateQGhostVtx,
					   vector<MilanLongInt> &privateQMsgType,
					   vector<MilanInt> &privateQOwner,
					   MPI_Comm comm,
					   MilanLongInt *msgActual,
					   vector<MilanLongInt> &Message)
{

    MilanLongInt initialSize = QLocalVtx.size();
    MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
    int option;
    MilanLongInt mateVal;

#ifdef PRINT_DEBUG_INFO_
    cout << "\n(" << myRank << "=========================************===============================" << endl;
    fflush(stdout);
    fflush(stdout);
#endif

#ifdef COUNT_LOCAL_VERTEX
    MilanLongInt localVertices = 0;
#endif
    //#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
    firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\
		 privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
        num_threads(NUM_THREAD)                                                        \
            reduction(+                                                                \
                      : msgInd[:1], PCounter                                           \
                      [:numProcs], myCard                                              \
                      [:1], NumMessagesBundled                                         \
                      [:1], msgActual                                                  \
                      [:1])
    {

        while (!U.empty()) {

	  extractUChunk(UChunkBeingProcessed, U, privateU);

	  for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
	    cout << "\n(" << myRank << ")u: " << u;
	    fflush(stdout);
#endif
	    if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices

#ifdef COUNT_LOCAL_VERTEX
	      localVertices++;
#endif
	      // Get the Adjacency list for u
	      adj1 = verLocPtr[u - StartIndex]; // Pointer
	      adj2 = verLocPtr[u - StartIndex + 1];
	      for (k = adj1; k < adj2; k++) {
		option = -1;
		v = verLocInd[k];

		if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:

#ifdef PRINT_DEBUG_INFO_
		  cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
		  fflush(stdout);
#endif
#pragma omp atomic read
		  mateVal = Mate[v - StartIndex];
		  // If the current vertex is pointing to a matched vertex and is not matched
		  if (mateVal < 0) {
#pragma omp critical
		    {
#pragma omp atomic read
		      mateVal = Mate[v - StartIndex];
		      // If the current vertex is pointing to a matched vertex and is not matched
		      if (mateVal < 0) {
			
			if (candidateMate[v - StartIndex] == u)  {
			  // Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
			  w = computeCandidateMateS(verLocPtr[v - StartIndex],
						   verLocPtr[v - StartIndex + 1],
						   edgeLocWeight, 0,
						   verLocInd, StartIndex, EndIndex,
						   GMate, Mate, Ghost2LocalMap);
			  candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
			  cout << "\n(" << myRank << ")" << v << " Points to: " << w;
			  fflush(stdout);
#endif
			  // If found a dominating edge:
			  if (w >= 0)  {
			    
			    if ((w < StartIndex) || (w > EndIndex))  { // A ghost
#ifdef PRINT_DEBUG_INFO_
			      cout << "\n(" << myRank << ")Sending a request message:";
			      cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
			      option = 2;
			      
			      if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
				option = 1;
				Mate[v - StartIndex] = w;     // v is a local vertex
				GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
				
			      } // End of if CandidateMate[w] = v
			    }     // End of if a Ghost Vertex
			    else   { // w is a local vertex
			      if (candidateMate[w - StartIndex] == v)  {
				option = 3;
				Mate[v - StartIndex] = w; // v is a local vertex
				Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
				cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
				fflush(stdout);
#endif
			      } // End of if(CandidateMate(w) = v
			    }     // End of Else
			  }         // End of if(w >=0)
			  else
			    option = 4; // End of Else: w == -1
			  // End:   PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
			} // End of If (candidateMate[v-StartIndex] == u
		      }
		    }     // End of task
		  }         // mateval < 0
		}             // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
		else { // Neighbor is a ghost vertex

#pragma omp critical
		  {
		    if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
		      candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
		    if (v != Mate[u - StartIndex])
		      option = 5; // u is local
		  }                   // End of critical
		}                       // End of Else //A Ghost Vertex

		switch (option)
		  {
		  case -1:
		    // No things to do
		    break;
		  case 1:
		    // Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
		    privateU.push_back(v);
		    privateU.push_back(w);
		    (*myCard)++;
#ifdef PRINT_DEBUG_INFO_
		    cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
		    fflush(stdout);
#endif
		    // Decrement the counter:
		    PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
		  case 2:

		    // Found a dominating edge, it is a ghost
		    ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);

		    // Build the Message Packet:
		    // Message[0] = v;       // LOCAL
		    // Message[1] = w;       // GHOST
		    // Message[2] = REQUEST; // TYPE
		    // Send a Request (Asynchronous)
		    // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);

		    (*msgActual)++;
		    (*msgInd)++;

		    privateQLocalVtx.push_back(v);
		    privateQGhostVtx.push_back(w);
		    privateQMsgType.push_back(REQUEST);
		    privateQOwner.push_back(ghostOwner);
		    break;
		  case 3:
		    privateU.push_back(v);
		    privateU.push_back(w);
		    (*myCard)++;
		    break;
		  case 4:
		    // Could not find a dominating vertex
		    adj11 = verLocPtr[v - StartIndex];
		    adj12 = verLocPtr[v - StartIndex + 1];
		    for (k1 = adj11; k1 < adj12; k1++) {
		      w = verLocInd[k1];
		      if ((w < StartIndex) || (w > EndIndex))  { // A ghost
#ifdef PRINT_DEBUG_INFO_
			cout << "\n(" << myRank << ")Sending a failure message: ";
			cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
			fflush(stdout);
#endif
			ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
			// Build the Message Packet:
			// Message[0] = v;       // LOCAL
			// Message[1] = w;       // GHOST
			// Message[2] = FAILURE; // TYPE
			// Send a Request (Asynchronous)
			// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);

			(*msgActual)++;
			(*msgInd)++;
			privateQLocalVtx.push_back(v);
			privateQGhostVtx.push_back(w);
			privateQMsgType.push_back(FAILURE);
			privateQOwner.push_back(ghostOwner);

		      } // End of if(GHOST)
		    }     // End of for loop
		    break;
		  case 5:
		  default:

#ifdef PRINT_DEBUG_INFO_
		    cout << "\n(" << myRank << ")Sending a success message: ";
		    cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
		    fflush(stdout);
#endif

		    ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
		    // Build the Message Packet:
		    // Message[0] = u;       // LOCAL
		    // Message[1] = v;       // GHOST
		    // Message[2] = SUCCESS; // TYPE
		    // Send a Request (Asynchronous)
		    // MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
		    (*msgActual)++;
		    (*msgInd)++;
		    privateQLocalVtx.push_back(u);
		    privateQGhostVtx.push_back(v);
		    privateQMsgType.push_back(SUCCESS);
		    privateQOwner.push_back(ghostOwner);

		    break;
		  } // End of switch
	      }     // End of inner for
	    }
	  } // End of outer for

	  queuesTransfer(U, privateU, QLocalVtx,
			 QGhostVtx,
			 QMsgType, QOwner, privateQLocalVtx,
			 privateQGhostVtx,
			 privateQMsgType,
			 privateQOwner);

        } // End of while ( !U.empty() )

#ifdef COUNT_LOCAL_VERTEX
        printf("Count local vertexes: %ld for thread %d of processor %d\n",
               localVertices, mp_get_thread_num(), myRank);
#endif
    } // End of parallel region

    // Send the messages
#ifdef DEBUG_HANG_
    cout << myRank<<" Sending: "<<QOwner.size()-initialSize<<" messages" <<endl;
#endif
    for (int i = initialSize; i < QOwner.size(); i++)   {
      Message[0] = QLocalVtx[i];
      Message[1] = QGhostVtx[i];
      Message[2] = QMsgType[i];
      ghostOwner = QOwner[i];
      //MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
      //cout << myRank<<" Sending to "<<ghostOwner<<endl;
      MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
    }
#ifdef DEBUG_HANG_
    cout << myRank<<" Done sending messages"<<endl;
#endif
}