experimental functions in C++

main
Luca Lombardo 2 years ago
parent b58f99d9a1
commit 790d9e865e

@ -0,0 +1,122 @@
#include <iostream>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>
#include <mutex>
#include <thread>
#include <functional>
#include <set>
using namespace std;
// It receives the file name as a string and returns a dictionary with the keys being the UserID and the values being a vector of VenueID associated with that UserID.
unordered_map<string, set<string>> createDictFromFile(string filename) {
// Create an empty dictionary
unordered_map<string, set<string>> dict;
// Open the file
ifstream file(filename);
// Check if the file was opened successfully
if (!file.is_open()) {
cerr << "Error opening file " << filename << endl;
return dict;
}
// Read the file line by line
string userId, venueId;
while (file.good()) {
file >> userId >> venueId;
// Add the venueId to the vector of venues associated with the userId
dict[userId].insert(venueId);
}
cout << "Dict created" << endl;
// Return the dictionary
return dict;
}
// void create_tsv_multi(unordered_map<string, vector<string>> dict, mutex& dict_mutex) {
// // Create an output stream to write the file
// ofstream out_file("output.tsv");
// // Create a mutex to protect the output file
// mutex out_file_mutex;
// // Loop over all the key-value pairs in the map
// for (const auto& kv1 : dict) {
// for (const auto& kv2 : dict) {
// // Check if the keys are the same
// if (kv1.first == kv2.first) continue;
// // Check if the values have elements in common
// vector<string> common;
// for (const auto& str1 : kv1.second) {
// for (const auto& str2 : kv2.second) {
// if (str1 == str2) common.push_back(str1);
// }
// }
// // Write the keys and the number of common elements to the output file
// if (!common.empty()) {
// // Lock the mutexes before accessing the dict and the output file
// lock_guard<mutex> dict_guard(dict_mutex);
// lock_guard<mutex> out_file_guard(out_file_mutex);
// out_file << kv1.first << "\t" << kv2.first << "\t" << common.size() << endl;
// }
// }
// }
// }
void create_tsv(const unordered_map<string, set<string>>& dict, string outfilename) {
// Create an output stream to write the file
ofstream out_file(outfilename);
// Loop over all the key-value pairs in the map
unsigned long long i = 0;
for (const auto& kv1 : dict) {
if (!((++i) & 127)) cout << (((double)i) * 100 / dict.size()) << "%\r" << flush;
for (const auto& kv2 : dict) {
// Check if the keys are the same
if(kv1.first >= kv2.first) continue;
// Check if the values have elements in common
set<string> common;
set_intersection(kv1.second.begin(), kv1.second.end(), kv2.second.begin(), kv2.second.end(), inserter(common, common.begin()));
// Write the keys and the number of common elements to the output file
if (!common.empty()) {
out_file << kv1.first << "\t" << kv2.first << "\t" << common.size() << endl;
// cout << kv1.first << "\t" << kv2.first << "\t" << common.size() << endl;
}
}
}
}
void print_help() {
cout << "Usage: ./main IN_FILE_PATH OUT_FILE_PATH" << endl;
cout << "Suggested options: \n\t./main data/brightkite/brightkite_checkins.txt data/brightkite/brightkite_checkins_graph.tsv \n\t./main data/gowalla/gowalla_checkins.txt data/gowalla/gowalla_checkins_graph.tsv \n\t./main data/foursquare/foursquare_checkins.txt data/foursquare/foursquare_checkins_graph.tsv" << endl;
}
// int main() {
// unordered_map<string, set<string>> dict = createDictFromFile("data/foursquare/foursquare_checkins_TKY.txt");
// create_tsv(dict);
// }
int main(int argc, const char* argv[]) {
if (argc == 3) {
string in_file = argv[1];
string out_file = argv[2];
if (in_file == "-h" || in_file == "--help" || out_file == "-h" || out_file == "--help") {
print_help();
return 0;
}
unordered_map<string, set<string>> dict = createDictFromFile(in_file);
create_tsv(dict, out_file);
return 0;
} else {
print_help();
return 0;
}
}

@ -0,0 +1,684 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <cmath>
#include <random>
#include <queue>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
class Graph {
public:
// Default constructor
Graph() {}
// Add a node to the graph
void add_node(int node) {
nodes_.insert(node);
}
// Add an edge to the graph
void add_edge(int u, int v) {
adjacency_list_[u].insert(v);
adjacency_list_[v].insert(u);
}
// Remove a node from the graph
void remove_node(int node) {
nodes_.erase(node);
adjacency_list_.erase(node);
for (auto& pair : adjacency_list_) {
pair.second.erase(node);
}
}
// Remove an edge from the graph
void remove_edge(int u, int v) {
adjacency_list_[u].erase(v);
adjacency_list_[v].erase(u);
}
// Return the number of nodes in the graph. Use unsigned int to avoid compiler warning
unsigned int num_nodes() const {
return nodes_.size();
}
// Return the number of edges in the graph
unsigned int num_edges() const {
int num_edges = 0;
for (const auto& pair : adjacency_list_) {
num_edges += pair.second.size();
}
return num_edges / 2;
}
// Return the degree of a node
int degree(int node) const {
return adjacency_list_.at(node).size();
}
// Return the neighbors of a node
std::vector<int> neighbors(int node) const {
return std::vector<int>(adjacency_list_.at(node).begin(),
adjacency_list_.at(node).end());
}
// Check if a node is in the graph
bool has_node(int node) const {
return nodes_.count(node) > 0;
}
// Check if an edge exists in the graph
bool has_edge(int u, int v) const {
if (!has_node(u) || !has_node(v)) {
return false;
}
return adjacency_list_.at(u).count(v) > 0;
}
// Check if the graph is connected
bool is_connected() const {
if (num_nodes() == 0) {
return true;
}
std::unordered_set<int> visited;
std::queue<int> queue;
queue.push(*nodes_.begin());
while (!queue.empty()) {
int node = queue.front();
queue.pop();
if (visited.count(node) == 0) {
visited.insert(node);
for (int neighbor : neighbors(node)) {
queue.push(neighbor);
}
}
}
// convert num_nodes() to unsigned int to avoid compiler warning
return visited.size() == num_nodes();
}
// Return the set of nodes in the graph
const std::unordered_set<int>& nodes() const {
return nodes_;
}
// Return the set of edges in the graph
std::vector<std::pair<int, int>> edges() const {
std::vector<std::pair<int, int>> edges;
for (const auto& pair : adjacency_list_) {
int u = pair.first;
for (int v : pair.second) {
if (u < v) {
edges.push_back({u, v});
}
}
}
return edges;
}
// Return the adjacency list representation of the graph
const std::unordered_map<int, std::unordered_set<int>>& adjacency_list() const {
return adjacency_list_;
}
// Info function that prints number of nodes and edges
void info() const {
std::cout << "Graph with " << num_nodes() << " nodes and " << num_edges() << " edges" << std::endl;
}
// Density function that returns the density of the graph
double density() const {
return 2.0 * num_edges() / (num_nodes() * (num_nodes() - 1));
}
private:
// Set of nodes in the graph
std::unordered_set<int> nodes_;
// Adjacency list representation of the graph
std::unordered_map<int, std::unordered_set<int>> adjacency_list_;
};
// Read graph from edge list file
Graph read_graph(const std::string& filename) {
Graph G;
std::ifstream file(filename);
std::string line;
while (std::getline(file, line)) {
std::stringstream ss(line);
int u, v;
ss >> u >> v;
G.add_node(u);
G.add_node(v);
G.add_edge(u, v);
}
G.info();
return G;
}
// Return a bool if there is a path between two nodes
bool has_path(const Graph& G, int u, int v) {
if (!G.has_node(u) || !G.has_node(v)) {
return false;
}
std::unordered_set<int> visited;
std::queue<int> queue;
queue.push(u);
while (!queue.empty()) {
int node = queue.front();
queue.pop();
if (visited.count(node) == 0) {
visited.insert(node);
for (int neighbor : G.neighbors(node)) {
if (neighbor == v) {
return true;
}
queue.push(neighbor);
}
}
}
return false;
}
// Check if the graph is connected. If not, return a list of connected components
std::vector<std::vector<int>> connected_components(const Graph& G) {
std::vector<std::vector<int>> components;
std::unordered_set<int> visited;
for (int u : G.nodes()) {
if (visited.count(u) == 0) {
std::vector<int> component;
std::queue<int> q;
q.push(u);
visited.insert(u);
while (!q.empty()) {
int v = q.front();
q.pop();
component.push_back(v);
for (int w : G.neighbors(v)) {
if (visited.count(w) == 0) {
visited.insert(w);
q.push(w);
}
}
}
components.push_back(component);
}
}
return components;
}
// Return the largest connected component of a graph, use the connected_components function
Graph largest_component(const Graph& G) {
std::vector<std::vector<int>> components = connected_components(G);
std::vector<int> largest_component;
unsigned int largest_size = 0;
for (const std::vector<int>& component : components) {
if (component.size() > largest_size) {
largest_size = component.size();
largest_component = component;
}
}
Graph H;
for (int u : largest_component) {
for (int v : G.neighbors(u)) {
if (std::find(largest_component.begin(), largest_component.end(), v) != largest_component.end()) {
H.add_edge(u, v);
}
}
}
return H;
}
// Randomly rewire an edge with probability `p`
void rewire(Graph& G, int u, int v, double p) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<> dis(0, 1);
if (dis(gen) < p) {
// Choose a new node randomly
std::uniform_int_distribution<> node_dis(0, G.nodes().size() - 1);
int w = *std::next(G.nodes().begin(), node_dis(gen));
// Remove the edge (u, v) and add the edge (u, w)
G.add_edge(u, w);
G.add_edge(v, w);
}
}
// Compute a random graph by swapping edges of a given graph.
Graph random_reference(const Graph& G, int k) {
Graph H;
// Add the nodes to the new graph
for (int u : G.nodes()) {
H.add_node(u);
}
// Choose the probability of rewiring an edge
double p = k / (G.nodes().size() - 1.0);
// Add edges to the new graph
for (int u : G.nodes()) {
for (int v : G.neighbors(u)) {
if (u < v) {
H.add_edge(u, v);
rewire(H, u, v, p);
}
}
}
return H;
}
// Latticize the given graph by swapping edges.
Graph lattice_reference(const Graph& G, int n) {
Graph H;
// Add the nodes to the new graph
for (int u : G.nodes()) {
H.add_node(u);
}
// Add edges to the new graph
for (int u : G.nodes()) {
for (int v : G.neighbors(u)) {
if (u < v && has_path(G, u, v)) {
// Check if u and v are neighbors in the lattice
int u_x = u / n;
int u_y = u % n;
int v_x = v / n;
int v_y = v % n;
if ((u_x == v_x && std::abs(u_y - v_y) == 1) ||
(u_y == v_y && std::abs(u_x - v_x) == 1)) {
H.add_edge(u, v);
}
}
}
}
return H;
}
std::vector<int> cumulative_distribution(const Graph& G) {
std::vector<int> cdf;
int sum = 0;
for (int u : G.nodes()) {
sum += G.degree(u);
cdf.push_back(sum);
}
return cdf;
}
// Calculate the D matrix for the given number of nodes.
std::vector<std::vector<int>> D_matrix(int n) {
std::vector<std::vector<int>> D(n, std::vector<int>(n));
std::vector<int> un(n - 1), um(n - 1);
std::iota(un.begin(), un.end(), 1);
std::iota(um.rbegin(), um.rend(), 1);
std::vector<int> u(n);
u[0] = 0;
for (int i = 1; i < n; i++) {
u[i] = (un[i - 1] < um[i - 1]) ? un[i - 1] : um[i - 1];
}
for (int v = 0; v < std::ceil(n / 2.0); v++) {
std::vector<int> d(u.begin() + v + 1, u.end());
d.insert(d.end(), u.begin(), u.begin() + v + 1);
D[n - v - 1] = d;
D[v] = std::vector<int>(d.rbegin(), d.rend());
}
return D;
}
// Choose a value from the given cumulative distribution.
int discrete_sequence(int n, const std::vector<int>& cdf, std::mt19937& rng) {
std::uniform_int_distribution<int> dist(0, cdf.back());
int value = dist(rng);
return std::lower_bound(cdf.begin(), cdf.end(), value) - cdf.begin();
}
int random_choice(const std::vector<int>& cdf, std::mt19937& rng) {
return discrete_sequence(1, cdf, rng);
}
// takes as input a graph, an int, an array of distance to the diagonal matrix (default None), a bool for connected (default True)
Graph lattice_reference2(const Graph& G, int niter, std::vector<std::vector<int>> distance_to_diagonal = {}, bool connected = true) {
Graph H;
// Add the nodes to the new graph
for (int u : G.nodes()) {
H.add_node(u);
}
// if there are less then 4 nodes, return an error
if (G.nodes().size() < 4) {
std::cout << "Error: Graph must have at least 4 nodes" << std::endl;
return H;
}
// Calculate the cumulative distribution of node degree
std::vector<int> cdf = cumulative_distribution(G);
// Calculate the D matrix
std::vector<std::vector<int>> D = D_matrix(G.num_nodes());
// niter = niter * nedges
niter = niter * G.num_edges();
// # maximal number of rewiring attempts per 'niter'
int max_attempts = G.num_nodes() * G.num_edges() / (G.num_nodes() * (G.num_nodes() - 1) / 2);
// For loop in range niter
for (int i = 0; i < niter; i++) {
int n = 0;
while (n < max_attempts) {
// pick two random edges without creating edge list
// choose source node indices from discrete distribution
std::mt19937 rng;
rng.seed(std::random_device{}());
int u = random_choice(cdf, rng);
int v = random_choice(cdf, rng);
// choose target node indices from discrete distribution
int w = random_choice(cdf, rng);
int x = random_choice(cdf, rng);
// check if the edges are distinct
if (u == v || u == w || u == x || v == w || v == x || w == x) {
n++;
continue;
}
// check if the edges are already present
if (H.has_edge(u, v) || H.has_edge(u, w) || H.has_edge(u, x) || H.has_edge(v, w) || H.has_edge(v, x) || H.has_edge(w, x)) {
n++;
continue;
}
// check if the edges are neighbors
if (G.has_edge(u, v) || G.has_edge(u, w) || G.has_edge(u, x) || G.has_edge(v, w) || G.has_edge(v, x) || G.has_edge(w, x)) {
n++;
continue;
}
// check if the edges are in the same distance to the diagonal. Do not create parallel edges
if (distance_to_diagonal.size() > 0) {
int u_x = u / n;
int u_y = u % n;
int v_x = v / n;
int v_y = v % n;
int w_x = w / n;
int w_y = w % n;
int x_x = x / n;
int x_y = x % n;
int d_uv = distance_to_diagonal[u_x][u_y] + distance_to_diagonal[v_x][v_y];
int d_ux = distance_to_diagonal[u_x][u_y] + distance_to_diagonal[x_x][x_y];
int d_uw = distance_to_diagonal[u_x][u_y] + distance_to_diagonal[w_x][w_y];
int d_vx = distance_to_diagonal[v_x][v_y] + distance_to_diagonal[x_x][x_y];
int d_vw = distance_to_diagonal[v_x][v_y] + distance_to_diagonal[w_x][w_y];
int d_wx = distance_to_diagonal[w_x][w_y] + distance_to_diagonal[x_x][x_y];
if (d_uv == d_ux || d_uv == d_uw || d_uv == d_vx || d_uv == d_vw || d_uv == d_wx) {
n++;
continue;
}
}
// check if the edges are connected
if (connected) {
if (has_path(H, u, v) || has_path(H, u, w) || has_path(H, u, x) || has_path(H, v, w) || has_path(H, v, x) || has_path(H, w, x)) {
n++;
continue;
}
}
// add the edges to the new graph
H.add_edge(u, v);
H.add_edge(u, w);
H.add_edge(u, x);
H.add_edge(v, w);
H.add_edge(v, x);
H.add_edge(w, x);
break;
}
}
return H;
}
// Latticize the given graph by swapping edges and return the modified graph.
Graph latticize(const Graph& G, int niter) {
Graph H = G;
// Create a random number generator
std::mt19937 rng;
rng.seed(std::random_device{}());
std::vector<int> degrees;
for (int u : H.nodes()) {
degrees.push_back(H.degree(u));
}
// Create a cumulative distribution of the node degrees
std::vector<int> keys;
for (int u : H.nodes()) {
keys.push_back(u);
degrees.push_back(H.degree(u));
}
std::vector<int> cdf = cumulative_distribution(H);
for (int i = 0; i < niter; i++) {
// Choose two random nodes
int u = keys[random_choice(cdf, rng)];
int v = keys[random_choice(cdf, rng)];
// Choose two random neighbors of the nodes
std::vector<int> neighbors_u = H.neighbors(u);
int w = neighbors_u[rng() % neighbors_u.size()];
std::vector<int> neighbors_v = H.neighbors(v);
int x = neighbors_v[rng() % neighbors_v.size()];
// Swap the edges
H.add_edge(u, x);
H.add_edge(v, w);
H.remove_edge(u, w);
H.remove_edge(v, x);
}
// Return the modified graph
return H;
}
// Calculate the clustering coefficient of a node
double clustering_coefficient(const Graph& G, int u) {
std::unordered_set<int> neighbors;
for (int v : G.neighbors(u)) {
neighbors.insert(v);
}
int num_neighbors = neighbors.size();
if (num_neighbors < 2) {
return 0.0;
}
int num_edges = 0;
for (int v : neighbors) {
for (int w : neighbors) {
if (v < w && G.has_edge(v, w)) {
num_edges++;
}
}
}
return static_cast<double>(num_edges) / (num_neighbors * (num_neighbors - 1) / 2.0);
}
// Calculate the average clustering coefficient of a graph
double average_clustering(const Graph& G) {
int num_nodes = G.nodes().size();
double sum = 0.0;
for (int u : G.nodes()) {
sum += clustering_coefficient(G, u);
}
return sum / num_nodes;
}
// Calculate the shortest path between two nodes using breadth-first search (Dijkstra's algorithm)
std::vector<int> shortest_path(const Graph& G, int source, int target) {
std::queue<int> q;
std::unordered_map<int, int> predecessor;
std::unordered_set<int> visited;
q.push(source);
visited.insert(source);
while (!q.empty()) {
int u = q.front();
q.pop();
for (int v : G.neighbors(u)) {
if (visited.count(v) == 0) {
visited.insert(v);
predecessor[v] = u;
q.push(v);
}
}
}
// Construct the shortest path
std::vector<int> path;
if (predecessor.count(target) > 0) {
int u = target;
while (u != source) {
path.push_back(u);
u = predecessor[u];
}
path.push_back(source);
}
std::reverse(path.begin(), path.end());
return path;
}
// Calculate the average shortest path length of a graph
double average_shortest_path_length(const Graph& G) {
double sum = 0.0;
int num_paths = 0;
for (int u : G.nodes()) {
for (int v : G.nodes()) {
if (u < v && has_path(G, u, v)) {
sum += shortest_path(G, u, v).size();
num_paths++;
}
}
}
return sum / num_paths;
}
// Calculate the average degree of a graph
double average_degree(const Graph& G) {
int num_nodes = G.nodes().size();
int sum = 0;
for (int u : G.nodes()) {
sum += G.neighbors(u).size();
}
return static_cast<double>(sum) / num_nodes;
}
// Calculate the omega index of a graph
double omega(const Graph& G, int niter=2, int nrand=2) {
double C = average_clustering(G);
std::cout << "Clustering Coefficient of the original graph = " << C << std::endl;
double L = average_shortest_path_length(G);
std::cout << "L = " << L << std::endl;
double Lr_sum = 0.0;
std::cout << "Starting random reference" << std::endl;
for (int i = 0; i < nrand; i++) {
std::cout << "\tIteration " << i+1 << std::endl;
Graph H = random_reference(G, niter);
std::cout << "\tCreated random reference" << std::endl;
Lr_sum += average_shortest_path_length(H);
std::cout << "\tCalculated average shortest path length of the random reference" << std::endl;
}
double Lr = Lr_sum / nrand;
double Cl_sum = 0.0;
std::cout << "Starting lattice reference" << std::endl;
for (int i = 0; i < nrand; i++) {
std::cout << "\tIteration " << i+1 << std::endl;
Graph H = latticize(G, niter);
std::cout << "\tCreated lattice reference" << std::endl;
Cl_sum += average_clustering(H);
std::cout << "\tCalculated average clustering of the lattice reference" << std::endl;
}
double Cl = Cl_sum / nrand;
return Lr / L - C / Cl;
}
// Calculate the sigma index of a graph
double sigma(const Graph& G, int niter=2, int nrand=2) {
double C = average_clustering(G);
std::cout << "Clustering Coefficient of the original graph = " << C << std::endl;
double L = average_shortest_path_length(G);
std::cout << "Average shortest path of the original graph = " << L << std::endl;
double Lr_sum = 0.0;
double Cl_sum = 0.0;
for (int i = 0; i < nrand; i++) {
std::cout << "\tIteration " << i+1 << std::endl;
Graph H = random_reference(G, niter);
std::cout << "\tCreated random reference" << std::endl;
Lr_sum += average_shortest_path_length(H);
std::cout << "\tCalculated average shortest path length of the random reference" << std::endl;
Cl_sum += average_clustering(H);
std::cout << "\tCalculated average clustering of the random reference" << std::endl;
}
double Lr = Lr_sum / nrand;
double Cl = Cl_sum / nrand;
return (Lr / L) / (C / Cl);
}
int main() {
std::cout << "\nStarting the computation for the Foursquare network" << std::endl;
Graph foursquare = read_graph("data/foursquare/foursquare_friends_edges_filtered.tsv");
// std::cout << "Omega: " << omega(foursquare) << std::endl;
std::cout << "Sigma: " << sigma(foursquare) << std::endl;
std::cout << "\nStarting the computation for the Brightkite network" << std::endl;
Graph brightkite = read_graph("data/brightkite/brightkite_friends_edges_filtered.tsv");
// std::cout << "Omega: " << omega(brightkite) << std::endl;
std::cout << "Sigma: " << sigma(brightkite) << std::endl;
std::cout << "\nStarting the computation for the Gowalla network" << std::endl;
Graph gowalla = read_graph("data/gowalla/gowalla_friends_edges_filtered.tsv");
// std::cout << "Omega: " << omega(gowalla) << std::endl;
std::cout << "Sigma: " << sigma(gowalla) << std::endl;
return 0;
}

@ -0,0 +1,73 @@
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <utility>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/small_world_generator.hpp>
#include <boost/random/linear_congruential.hpp>
using namespace std;
using namespace boost;
typedef adjacency_list<vecS, vecS, undirectedS, no_property, no_property> Graph;
typedef small_world_iterator<minstd_rand, Graph> SWGen;
vector<pair<int, int>> lattice_reference(const string& edge_list_file, int niter, bool connectivity) {
vector<pair<int, int>> edges;
int num_nodes = 0;
// Read in the edge list from the input file
ifstream in(edge_list_file);
string line;
while (getline(in, line)) {
int u, v;
sscanf(line.c_str(), "%d\t%d", &u, &v);
edges.emplace_back(u, v);
num_nodes = max(num_nodes, max(u, v));
}
// Construct the graph from the edge list
Graph g(edges.begin(), edges.end(), num_nodes + 1);
// Create the small-world generator
minstd_rand gen;
SWGen sw_gen(g, niter);
// Generate the lattice reference and store the resulting edge list
vector<pair<int, int>> lattice_edges;
for (int i = 0; i < num_nodes; ++i) {
auto [u, v] = *sw_gen;
lattice_edges.emplace_back(u, v);
++sw_gen;
}
// convert the vector of pairs in a .tsv file called "lattice_reference.tsv"
ofstream out("lattice_reference.tsv");
for (const auto& [u, v] : lattice_edges) {
out << u << "\t" << v << endl;
}
// return the vector of pairs
return lattice_edges;
}
// main
int main(int argc, char* argv[]) {
if (argc != 4) {
cerr << "Usage: " << argv[0] << " <edge_list_file> <niter> <connectivity>" << endl;
return 1;
}
string edge_list_file = argv[1];
int niter = atoi(argv[2]);
bool connectivity = atoi(argv[3]);
lattice_reference(edge_list_file, niter, connectivity);
return 0;
}
Loading…
Cancel
Save