main
Luca Lombardo 2 years ago
parent 6a3b1a9b9f
commit 825030a26e

1
.gitignore vendored

@@ -142,3 +142,4 @@ data/
*.out
*.synctex.gz
*.toc
*.txt

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
from itertools import combinations
import os
import wget
import zipfile
@@ -96,40 +97,44 @@ def download_datasets():
        os.system("gunzip {}".format(os.path.join("data", "gowalla", file)))
def foursquare_checkins_graph(dataset: Literal['NYC', 'TKY']) -> nx.Graph:
def create_checkins_graph(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
    """
    This function takes as input a TSV file with 8 columns; each line in the file is a check-in. The function returns an undirected networkx graph object.
    This function takes as input a TSV file; each line in the file is a check-in. The function returns an undirected networkx graph object.
    Unlike the function create_graph used for the brightkite and gowalla datasets, we are not given a list of edges, so we can't use the function nx.from_pandas_edgelist. We have to create the graph manually.
    First, we retrieve the unique user IDs using the set() data structure: these are the nodes of our graph. Since we don't want to work with adjacency matrices due to their O(n^2) space complexity (even though we could store them in a compressed way thanks to their sparsity property), we use an adjacency list representation of the graph. We create a dictionary with the user IDs as keys and the venue IDs as values. Two users are connected if they have visited the same venue at least once. The weight of the edge is the number of common venues.
    First, we retrieve the unique user IDs: these are the nodes of our graph. We create a dictionary with the user IDs as keys and the venue IDs as values. Two users are connected if they have visited the same venue at least once. The weight of the edge is the number of common venues.
    """
    if dataset not in ["NYC", "TKY"]:
        raise ValueError("The dataset must be NYC or TKY")
    if dataset not in ['brightkite', 'gowalla',
                       'foursquareNYC', 'foursquareTKY']:
        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareNYC, foursquareTKY")
    file = os.path.join("data", "foursquare", "dataset_TSMC2014_{}.txt".format(dataset))
    # Based on the dataset, we have to read the file in a different way.
    if dataset == "foursquareNYC":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"], encoding="utf-8", encoding_errors="ignore")
    elif dataset == "foursquareTKY":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
    else:
        file = os.path.join("data", dataset, "loc-{}_totalCheckins.txt".format(dataset))
        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "CheckIn", "latitude", "longitude", "VenueID"], encoding="utf-8", encoding_errors="ignore")
    users = set(df["UserID"]) # get the unique user IDs
    # get the unique user IDs
    users = df["UserID"].unique()
    G = nx.Graph()
    G.add_nodes_from(users)
    print("Number of nodes added to the graph {}: {}".format(dataset, G.number_of_nodes()))
    users_venues = {} # key: user ID, value: set of venue IDs
    for user in users:
        users_venues[user] = set(df[df["UserID"] == user]["VenueID"])
    # create the edges
    for user1 in users: # nested for loop in python, I'm crying. C++ I miss you
        for user2 in users:
            if user1 != user2:
                if len(users_venues[user1].intersection(users_venues[user2])) > 0:
                    G.add_edge(user1, user2, weight=len(users_venues[user1].intersection(users_venues[user2])))
    users_venues = df.groupby("UserID")["VenueID"].apply(list).to_dict()
    return G
    for user1, user2 in combinations(users, 2):
        intersection = set(users_venues[user1]) & set(users_venues[user2])
        if len(intersection) > 0:
            G.add_edge(user1, user2, weight=len(intersection))
    print("Number of edges added to the graph {}: {}".format(dataset, G.number_of_edges()))
def friendships_graph(dataset: Literal['brightkite', 'gowalla']) -> nx.Graph:
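For context, here is a minimal, self-contained sketch of the edge construction the new create_checkins_graph code performs, run on a toy check-in table (the user and venue IDs below are invented for illustration): every pair of users sharing at least one venue gets an edge weighted by the size of their venue intersection.

from itertools import combinations
import networkx as nx
import pandas as pd

# Toy check-in table: one row per check-in (hypothetical IDs)
df = pd.DataFrame({
    "UserID":  [1, 1, 2, 2, 3],
    "VenueID": ["a", "b", "b", "c", "c"],
})

# One dict entry per user, listing the venues they checked into
users_venues = df.groupby("UserID")["VenueID"].apply(list).to_dict()

G = nx.Graph()
G.add_nodes_from(users_venues)
for user1, user2 in combinations(users_venues, 2):
    common = set(users_venues[user1]) & set(users_venues[user2])
    if common:
        G.add_edge(user1, user2, weight=len(common))

print(G.edges(data=True))  # [(1, 2, {'weight': 1}), (2, 3, {'weight': 1})]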

@@ -1,11 +0,0 @@
@article{yang2014modeling,
author={Yang, Dingqi and Zhang, Daqing and Zheng, Vincent W. and Yu, Zhiyong},
journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
title={Modeling User Activity Preference by Leveraging User Spatial Temporal Characteristics in LBSNs},
year={2015},
volume={45},
number={1},
pages={129--142},
ISSN={2168-2216},
publisher={IEEE}
}

@@ -0,0 +1,108 @@
#include <iostream>
#include <fstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <mutex>
#include <thread>
#include <functional>
using namespace std;
// Reads the input file and returns a dictionary with the keys being the UserIDs and the values being a vector of the VenueIDs associated with each UserID.
unordered_map<string, vector<string>> createDictFromFile(string filename) {
    // Create an empty dictionary
    unordered_map<string, vector<string>> dict;
    // Open the file
    ifstream file(filename);
    // Check if the file was opened successfully
    if (!file.is_open()) {
        cerr << "Error opening file " << filename << endl;
        return dict;
    }
    // Read the file line by line; each line holds a UserID and a VenueID
    string userId, venueId;
    while (file >> userId >> venueId) {
        // operator[] creates an empty vector the first time a userId is seen;
        // the venueId is then appended to that user's vector of venues
        dict[userId].push_back(venueId);
    }
    // Close the file
    file.close();
    cout << "Dict created" << endl;
    // Return the dictionary
    return dict;
}
// Each thread handles a disjoint subset of the users (every num_threads-th
// entry of the map, offset by thread_id), so the pairwise work is partitioned
// across the threads instead of being repeated by each of them. The dictionary
// is only read, so it needs no lock; the shared output file does.
void create_tsv(const unordered_map<string, vector<string>>& dict,
                ofstream& out_file, mutex& out_file_mutex,
                int thread_id, int num_threads) {
    size_t idx = 0;
    // Loop over all the key-value pairs in the map
    for (const auto& kv1 : dict) {
        // Skip the users assigned to other threads
        if (idx++ % num_threads != static_cast<size_t>(thread_id)) continue;
        // Hash the venues of the first user once, so each lookup below is O(1)
        unordered_set<string> venues1(kv1.second.begin(), kv1.second.end());
        for (const auto& kv2 : dict) {
            // Visit each unordered pair of users exactly once
            if (kv1.first >= kv2.first) continue;
            // Count the distinct venues the two users have in common
            // (building a set collapses repeated check-ins at the same venue)
            unordered_set<string> venues2(kv2.second.begin(), kv2.second.end());
            size_t common = 0;
            for (const auto& venue : venues2) {
                if (venues1.count(venue) > 0) common++;
            }
            // Write the two users and the number of common venues to the file
            if (common > 0) {
                // Lock the shared mutex before touching the shared output file
                lock_guard<mutex> out_file_guard(out_file_mutex);
                out_file << kv1.first << "\t" << kv2.first << "\t" << common << "\n";
            }
        }
    }
}
int main() {
    // Create a map of vectors
    unordered_map<string, vector<string>> dict = createDictFromFile("test.txt");
    // Open the output file once and protect it with a single shared mutex;
    // opening it inside every thread would make the threads truncate each
    // other's output, and a per-thread mutex would protect nothing
    ofstream out_file("output.tsv");
    mutex out_file_mutex;
    // Create an array of threads
    const int num_threads = 12;
    thread threads[num_threads];
    // Launch the threads, each with its own slice of the users
    for (int i = 0; i < num_threads; i++) {
        threads[i] = thread(create_tsv, cref(dict), ref(out_file),
                            ref(out_file_mutex), i, num_threads);
    }
    // Wait for the threads to finish
    for (int i = 0; i < num_threads; i++) {
        threads[i].join();
    }
    return 0;
}
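A note on building the new file: since it uses std::thread, it needs the threading flag on the usual toolchains; something like g++ -std=c++11 -pthread <source>.cpp -o checkins should work (the source file name is not shown in this diff, so the one here is a placeholder).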

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -485,13 +485,33 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of nodes added to the graph brightkite: 51406\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_51637/2618808480.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mBrightkite_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"brightkite\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mGowalla_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gowalla\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mFoursquare_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"foursquareNYC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/github/small-worlds/main.py\u001b[0m in \u001b[0;36mcreate_checkins_graph\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0;31m# now add the edges, try to use pandas to speed up the process\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0muser1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muser2\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcombinations\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m \u001b[0mintersection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers_venues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers_venues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 135\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintersection\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0mG\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_edge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muser1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muser2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintersection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"Brightkite_G = friendships_graph(\"brightkite\")\n",
"Gowalla_G = friendships_graph(\"gowalla\")\n",
"Foursquare_G = foursquare_checkins_graph(\"NYC\")"
"Brightkite_G = create_checkins_graph(\"brightkite\")\n",
"Gowalla_G = create_checkins_graph(\"gowalla\")\n",
"Foursquare_G = create_checkins_graph(\"foursquareNYC\")"
]
},
{
@@ -591,9 +611,21 @@
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"source": []
"outputs": [],
"source": [
"path = \"data/brightkite/loc-brightkite_totalCheckins.txt\"\n",
"# modify the file, take only the first and last column, return a test.txt file. Use pandas\n",
"\n",
"def modify_file(path):\n",
" df = pd.read_csv(path, sep=\"\\t\", header=None, names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
" df = df.iloc[:, [0, 4]]\n",
" df.to_csv(\"test.txt\", sep=\"\\t\", header=None, index=None)\n",
"\n",
"modify_file(path)"
]
}
],
"metadata": {
