diff --git a/.gitignore b/.gitignore
index ae6c52c..a74eeb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -142,3 +142,4 @@ data/
 *.out
 *.synctex.gz
 *.toc
+*.txt
diff --git a/main.py b/main.py
index a225103..7973f3c 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 # /bin/env/python3
 
+from itertools import combinations
 import os
 import wget
 import zipfile
@@ -96,40 +97,44 @@ def download_datasets():
             os.system("gunzip {}".format(os.path.join("data", "gowalla", file)))
 
 
-def foursquare_checkins_graph(dataset: Literal['NYC', 'TKY'])-> nx.Graph:
+def create_checkins_graph(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
     """
-    This function takes in input a tsv file with 8 columns, each line in the file is a check-in. The function returns an undirected networkx graph object.
+    This function takes as input a TSV file in which each line is a check-in, and returns an undirected networkx graph object.
 
-    Differently from the function create_graph used for the brightkite and gowalla dataset, we are not given a list of edges, so we can't use the function nx.from_pandas_edgelist. We have to create the graph manually.
-
-    Firstly, we retrive the unique user ID using the set() data structure: this are the nodes of our graph. Since we don't want to work with adjacency matrices due to their O(mn) space complexity (even tho, we could memorize them in a compressed way thanks to their sparsity propriety), we use an adjacency list representation of the graph. We create a dictionary with the users ID as keys and the venues ID as values. Two users are connected if they have visited the same venue at least once. The weight of the edge is the number of common venues.
+    First, we retrieve the unique user IDs: these are the nodes of our graph. We create a dictionary with the user IDs as keys and the lists of visited venue IDs as values. Two users are connected if they have visited the same venue at least once; the weight of the edge is the number of common venues.
     """
 
-    if dataset not in ["NYC", "TKY"]:
-        raise ValueError("The dataset must be NYC or TKY")
+    if dataset not in ['brightkite', 'gowalla',
+                       'foursquareNYC', 'foursquareTKY']:
+        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareNYC, foursquareTKY")
 
-    file = os.path.join("data", "foursquare", "dataset_TSMC2014_{}.txt".format(dataset))
+    # Based on the dataset, we have to read the file in a different way.
+    if dataset == "foursquareNYC":
+        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
+        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
 
-    df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"], encoding="utf-8", encoding_errors="ignore")
+    elif dataset == "foursquareTKY":
+        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
+        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
+
+    else:
+        file = os.path.join("data", dataset, "loc-{}_totalCheckins.txt".format(dataset))
+        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "CheckIn", "latitude", "longitude", "VenueID"], encoding="utf-8", encoding_errors="ignore")
 
-    users = set(df["UserID"]) # get the unique users ID
+    # get the unique user IDs
+    users = df["UserID"].unique()
 
     G = nx.Graph()
     G.add_nodes_from(users)
+    print("Number of nodes added to the graph {}: {}".format(dataset, G.number_of_nodes()))
 
-    users_venues = {} # key: user ID, value: set of venues ID
-    for user in users:
-        users_venues[user] = set(df[df["UserID"] == user]["VenueID"])
-
-    # create the edges
-    for user1 in users: # nested for loop in python, I'm crying. C++ I miss you
-        for user2 in users:
-            if user1 != user2:
-                if len(users_venues[user1].intersection(users_venues[user2])) > 0:
-                    G.add_edge(user1, user2, weight=len(users_venues[user1].intersection(users_venues[user2])))
+    users_venues = df.groupby("UserID")["VenueID"].apply(list).to_dict()
 
-    return G
+    for user1, user2 in combinations(users, 2):
+        intersection = set(users_venues[user1]) & set(users_venues[user2])
+        if len(intersection) > 0:
+            G.add_edge(user1, user2, weight=len(intersection))
 
+    print("Number of edges added to the graph {}: {}".format(dataset, G.number_of_edges()))
+
+    return G
 
 
 def friendships_graph(dataset: Literal['brightkite', 'gowalla']) -> nx.Graph:
diff --git a/ref.bib b/ref.bib
deleted file mode 100644
index 3df3100..0000000
--- a/ref.bib
+++ /dev/null
@@ -1,11 +0,0 @@
-@article{yang2014modeling,
-  author={Yang, Dingqi and Zhang, Daqing and Zheng, Vincent. W. and Yu, Zhiyong},
-  journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
-  title={Modeling User Activity Preference by Leveraging User Spatial Temporal Characteristics in LBSNs},
-  year={2015},
-  volume={45},
-  number={1},
-  pages={129--142},
-  ISSN={2168-2216},
-  publisher={IEEE}
-}
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..4fb3be1
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,108 @@
+#include <iostream>
+#include <fstream>
+#include <functional>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <thread>
+#include <mutex>
+
+using namespace std;
+
+// createDictFromFile receives the file name as a string and returns a dictionary with the keys being the UserIDs and the values being vectors of the VenueIDs associated with each UserID.
+
+unordered_map<string, vector<string>> createDictFromFile(string filename) {
+    // Create an empty dictionary
+    unordered_map<string, vector<string>> dict;
+
+    // Open the file
+    ifstream file(filename);
+
+    // Check if the file was opened successfully
+    if (!file.is_open()) {
+        cerr << "Error opening file " << filename << endl;
+        return dict;
+    }
+
+    // Read the file line by line
+    string userId, venueId;
+    while (file >> userId >> venueId) {
+        // Check if the userId is already in the dictionary
+        if (dict.count(userId) == 0) {
+            // If not, create an entry in the dictionary with an empty vector of venues
+            dict[userId] = vector<string>();
+        }
+
+        // Add the venueId to the vector of venues associated with the userId
+        dict[userId].push_back(venueId);
+    }
+
+    // Close the file
+    file.close();
+
+    cout << "Dict created" << endl;
+
+    // Return the dictionary
+    return dict;
+}
+
+// Each thread works on its own stride of the user IDs, so the pairwise
+// comparison is partitioned across the threads instead of being repeated by
+// each of them; the shared output file is protected by a single mutex owned by main.
+void create_tsv(const unordered_map<string, vector<string>>& dict,
+                const vector<string>& users, ofstream& out_file,
+                mutex& out_file_mutex, int thread_id, int num_threads) {
+    // Loop over the user IDs assigned to this thread
+    for (size_t i = thread_id; i < users.size(); i += num_threads) {
+        const vector<string>& venues1 = dict.at(users[i]);
+
+        // Only compare with the users that come after, so each pair is visited once
+        for (size_t j = i + 1; j < users.size(); j++) {
+            const vector<string>& venues2 = dict.at(users[j]);
+
+            // Collect the venues the two users have in common
+            vector<string> common;
+            for (const auto& str1 : venues1) {
+                for (const auto& str2 : venues2) {
+                    if (str1 == str2) common.push_back(str1);
+                }
+            }
+
+            // Write the two users and the number of common venues to the output file
+            if (!common.empty()) {
+                // Lock the mutex before accessing the shared output file
+                lock_guard<mutex> out_file_guard(out_file_mutex);
+
+                out_file << users[i] << "\t" << users[j] << "\t" << common.size() << endl;
+            }
+        }
+    }
+}
+
+
+int main() {
+    // Create a map of vectors
+    unordered_map<string, vector<string>> dict = createDictFromFile("test.txt");
+
+    // Collect the user IDs once, so the threads can partition them
+    vector<string> users;
+    users.reserve(dict.size());
+    for (const auto& kv : dict) {
+        users.push_back(kv.first);
+    }
+
+    // Shared output file and the mutex that protects it
+    ofstream out_file("output.tsv");
+    mutex out_file_mutex;
+
+    // Launch the threads, each with its own stride of user IDs
+    const int num_threads = 12;
+    thread threads[num_threads];
+    for (int i = 0; i < num_threads; i++) {
+        threads[i] = thread(create_tsv, cref(dict), cref(users),
+                            ref(out_file), ref(out_file_mutex), i, num_threads);
+    }
+
+    // Wait for the threads to finish
+    for (int i = 0; i < num_threads; i++) {
+        threads[i].join();
+    }
+
+    return 0;
+}
diff --git a/testing.ipynb b/testing.ipynb
index 2c58931..6eb8a1a 100644
--- a/testing.ipynb
+++ b/testing.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -485,13 +485,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of nodes added to the graph brightkite: 51406\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_51637/2618808480.py\u001b[0m in \u001b[0;36m<module>\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mBrightkite_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"brightkite\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mGowalla_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gowalla\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mFoursquare_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"foursquareNYC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/github/small-worlds/main.py\u001b[0m in \u001b[0;36mcreate_checkins_graph\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0;31m# now add the edges, try to use pandas to speed up the process\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0muser1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muser2\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcombinations\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m \u001b[0mintersection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers_venues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers_venues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 135\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintersection\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0mG\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_edge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muser1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muser2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintersection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
    "source": [
-    "Brightkite_G = friendships_graph(\"brightkite\")\n",
-    "Gowalla_G = friendships_graph(\"gowalla\")\n",
-    "Foursquare_G = foursquare_checkins_graph(\"NYC\")"
+    "Brightkite_G = create_checkins_graph(\"brightkite\")\n",
+    "Gowalla_G = create_checkins_graph(\"gowalla\")\n",
+    "Foursquare_G = create_checkins_graph(\"foursquareNYC\")"
    ]
   },
   {
@@ -591,9 +611,21 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
-   "source": []
+   "outputs": [],
+   "source": [
+    "path = \"data/brightkite/loc-brightkite_totalCheckins.txt\"\n",
+    "# modify the file, take only the first and last column, return a test.txt file. Use pandas\n",
+    "\n",
+    "def modify_file(path):\n",
+    "    df = pd.read_csv(path, sep=\"\\t\", header=None, names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
+    "    df = df.iloc[:, [0, 4]]\n",
+    "    df.to_csv(\"test.txt\", sep=\"\\t\", header=None, index=None)\n",
+    "\n",
+    "modify_file(path)"
+   ]
  }
 ],
 "metadata": {
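
A note on the edge-construction step in main.py above: combinations(users, 2) enumerates every user pair, which is quadratic in the number of users. For the 51406 brightkite nodes printed in the notebook output, that is over 1.3 billion pairs, which is why the cell was interrupted. A vectorized alternative is to self-join the check-ins table on VenueID, so that only users who actually share a venue are ever paired. The sketch below is illustrative and not part of this commit; the function name is hypothetical, and it assumes user IDs are comparable (e.g. integers) and that no single venue has a huge visitor count, since the join grows with the square of a venue's popularity.

import networkx as nx
import pandas as pd

def checkins_graph_sketch(df: pd.DataFrame) -> nx.Graph:
    # One row per (user, venue) pair, so repeat check-ins don't inflate weights
    pairs = df[["UserID", "VenueID"]].drop_duplicates()

    # Self-join on VenueID: each resulting row is a pair of users who
    # checked in at the same venue (pandas appends _x and _y suffixes)
    joined = pairs.merge(pairs, on="VenueID")

    # Keep each unordered user pair once and drop self-pairs
    joined = joined[joined["UserID_x"] < joined["UserID_y"]]

    # The weight is the number of distinct venues the two users share
    edges = joined.groupby(["UserID_x", "UserID_y"]).size().reset_index(name="weight")

    G = nx.from_pandas_edgelist(edges, "UserID_x", "UserID_y", edge_attr="weight")
    G.add_nodes_from(pairs["UserID"].unique())  # keep users with no shared venues
    return G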