main
Luca Lombardo 2 years ago
parent 6a3b1a9b9f
commit 825030a26e

.gitignore vendored

@@ -142,3 +142,4 @@ data/
 *.out
 *.synctex.gz
 *.toc
+*.txt

main.py

@@ -1,5 +1,6 @@
 # /bin/env/python3
+from itertools import combinations
 import os
 import wget
 import zipfile
@@ -96,40 +97,44 @@ def download_datasets():
         os.system("gunzip {}".format(os.path.join("data", "gowalla", file)))

-def foursquare_checkins_graph(dataset: Literal['NYC', 'TKY']) -> nx.Graph:
+def create_checkins_graph(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
     """
-    This function takes in input a tsv file with 8 columns; each line in the file is a check-in. The function returns an undirected networkx graph object.
+    This function takes in input a tsv file; each line in the file is a check-in. The function returns an undirected networkx graph object.

-    Differently from the function create_graph used for the brightkite and gowalla datasets, we are not given a list of edges, so we can't use the function nx.from_pandas_edgelist: we have to create the graph manually. Firstly, we retrieve the unique user IDs using the set() data structure: these are the nodes of our graph. Since we don't want to work with adjacency matrices due to their O(mn) space complexity (even though we could store them in a compressed way thanks to their sparsity property), we use an adjacency list representation of the graph. We create a dictionary with the user IDs as keys and the venue IDs as values. Two users are connected if they have visited the same venue at least once. The weight of the edge is the number of common venues.
+    We create a dictionary with the user IDs as keys and the venue IDs as values. Two users are connected if they have visited the same venue at least once. The weight of the edge is the number of common venues.
     """

-    if dataset not in ["NYC", "TKY"]:
-        raise ValueError("The dataset must be NYC or TKY")
+    if dataset not in ['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']:
+        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareNYC, foursquareTKY")

-    file = os.path.join("data", "foursquare", "dataset_TSMC2014_{}.txt".format(dataset))
-    df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"], encoding="utf-8", encoding_errors="ignore")
+    # based on the dataset, we have to read the file in a different way
+    if dataset == "foursquareNYC":
+        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
+        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
+    elif dataset == "foursquareTKY":
+        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
+        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
+    else:
+        file = os.path.join("data", dataset, "loc-{}_totalCheckins.txt".format(dataset))
+        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "CheckIn", "latitude", "longitude", "VenueID"], encoding="utf-8", encoding_errors="ignore")

-    users = set(df["UserID"]) # get the unique user IDs
+    # get the unique user IDs
+    users = df["UserID"].unique()
     G = nx.Graph()
     G.add_nodes_from(users)
+    print("Number of nodes added to the graph {}: {}".format(dataset, G.number_of_nodes()))

-    users_venues = {} # key: user ID, value: set of venue IDs
-    for user in users:
-        users_venues[user] = set(df[df["UserID"] == user]["VenueID"])
+    users_venues = df.groupby("UserID")["VenueID"].apply(list).to_dict()

-    # create the edges
-    for user1 in users: # nested for loop in python, I'm crying. C++ I miss you
-        for user2 in users:
-            if user1 != user2:
-                if len(users_venues[user1].intersection(users_venues[user2])) > 0:
-                    G.add_edge(user1, user2, weight=len(users_venues[user1].intersection(users_venues[user2])))
+    for user1, user2 in combinations(users, 2):
+        intersection = set(users_venues[user1]) & set(users_venues[user2])
+        if len(intersection) > 0:
+            G.add_edge(user1, user2, weight=len(intersection))

+    print("Number of edges added to the graph {}: {}".format(dataset, G.number_of_edges()))
     return G

 def friendships_graph(dataset: Literal['brightkite', 'gowalla']) -> nx.Graph:
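
The combinations loop above still compares every pair of users, which is the step that gets interrupted in the notebook run further down. A minimal sketch of a faster construction, assuming the same df with UserID and VenueID columns (the function name checkins_graph_by_venue is illustrative, not part of this commit): invert the mapping to venue -> users, so only pairs that actually share a venue are ever generated.

    from collections import defaultdict
    from itertools import combinations

    import networkx as nx

    def checkins_graph_by_venue(df) -> nx.Graph:
        # venue -> set of users who checked in there
        venue_users = df.groupby("VenueID")["UserID"].apply(set)

        # a pair sharing k distinct venues is counted once per venue,
        # so the accumulated count is exactly the edge weight used above
        weights = defaultdict(int)
        for users in venue_users:
            for u1, u2 in combinations(sorted(users), 2):
                weights[(u1, u2)] += 1

        G = nx.Graph()
        G.add_nodes_from(df["UserID"].unique())
        G.add_weighted_edges_from((u1, u2, w) for (u1, u2), w in weights.items())
        return G

A very popular venue with k visitors still contributes O(k^2) pairs, but users with no venue in common are never compared at all.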

@@ -1,11 +0,0 @@
@article{yang2014modeling,
  author={Yang, Dingqi and Zhang, Daqing and Zheng, Vincent. W. and Yu, Zhiyong},
  journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
  title={Modeling User Activity Preference by Leveraging User Spatial Temporal Characteristics in LBSNs},
  year={2015},
  volume={45},
  number={1},
  pages={129--142},
  ISSN={2168-2216},
  publisher={IEEE}
}

@@ -0,0 +1,108 @@
#include <iostream>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>
#include <mutex>
#include <thread>
#include <functional>

using namespace std;

// Reads a whitespace-separated file of (UserID, VenueID) pairs and returns a
// dictionary with the keys being the UserID and the values being a vector of
// VenueID associated with that UserID.
unordered_map<string, vector<string>> createDictFromFile(const string& filename) {
    unordered_map<string, vector<string>> dict;

    ifstream file(filename);
    if (!file.is_open()) {
        cerr << "Error opening file " << filename << endl;
        return dict;
    }

    // Read the file line by line; operator[] default-constructs the vector
    // of venues on the first access to a new userId
    string userId, venueId;
    while (file >> userId >> venueId) {
        dict[userId].push_back(venueId);
    }

    cout << "Dict created" << endl;
    return dict;
}

// Each thread takes a stride-partitioned slice of the users (user i belongs to
// thread i % num_threads) and writes "user1 <tab> user2 <tab> #common venues"
// rows. The dict is only read, so it needs no lock; the output stream is
// shared between the threads and must be protected by the mutex.
void create_tsv(const vector<string>& users,
                const unordered_map<string, vector<string>>& dict,
                ofstream& out_file, mutex& out_file_mutex,
                size_t thread_id, size_t num_threads) {
    for (size_t i = thread_id; i < users.size(); i += num_threads) {
        const vector<string>& venues1 = dict.at(users[i]);
        // j > i, so each unordered pair is handled by exactly one thread
        for (size_t j = i + 1; j < users.size(); j++) {
            const vector<string>& venues2 = dict.at(users[j]);
            // Count the elements the two venue lists have in common
            size_t common = 0;
            for (const auto& v1 : venues1)
                for (const auto& v2 : venues2)
                    if (v1 == v2) common++;
            // Write the pair and the number of common elements to the file
            if (common > 0) {
                lock_guard<mutex> guard(out_file_mutex);
                out_file << users[i] << "\t" << users[j] << "\t" << common << "\n";
            }
        }
    }
}

int main() {
    unordered_map<string, vector<string>> dict = createDictFromFile("test.txt");

    // Collect the keys once so the O(n^2) work can be split among the threads
    vector<string> users;
    users.reserve(dict.size());
    for (const auto& kv : dict) users.push_back(kv.first);

    // One shared output file and one mutex guarding it
    ofstream out_file("output.tsv");
    mutex out_file_mutex;

    // Launch the threads, then wait for them to finish
    const size_t num_threads = 12;
    vector<thread> threads;
    for (size_t i = 0; i < num_threads; i++) {
        threads.emplace_back(create_tsv, cref(users), cref(dict),
                             ref(out_file), ref(out_file_mutex), i, num_threads);
    }
    for (auto& t : threads) t.join();

    return 0;
}
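
A note on building the helper above: std::thread requires the platform thread library at link time, e.g. g++ -O2 -pthread create_tsv.cpp -o create_tsv (the source file name is an assumption; it is not shown in the diff).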

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -485,13 +485,33 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 4,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Number of nodes added to the graph brightkite: 51406\n"
+]
+},
+{
+"ename": "KeyboardInterrupt",
+"evalue": "",
+"output_type": "error",
+"traceback": [
+"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+"\u001b[0;32m/tmp/ipykernel_51637/2618808480.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mBrightkite_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"brightkite\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mGowalla_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gowalla\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mFoursquare_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_checkins_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"foursquareNYC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+"\u001b[0;32m~/github/small-worlds/main.py\u001b[0m in \u001b[0;36mcreate_checkins_graph\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0;31m# now add the edges, try to use pandas to speed up the process\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0muser1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muser2\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcombinations\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m \u001b[0mintersection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers_venues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0musers_venues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muser2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 135\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintersection\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0mG\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_edge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muser1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muser2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintersection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+]
+}
+],
 "source": [
-"Brightkite_G = friendships_graph(\"brightkite\")\n",
-"Gowalla_G = friendships_graph(\"gowalla\")\n",
-"Foursquare_G = foursquare_checkins_graph(\"NYC\")"
+"Brightkite_G = create_checkins_graph(\"brightkite\")\n",
+"Gowalla_G = create_checkins_graph(\"gowalla\")\n",
+"Foursquare_G = create_checkins_graph(\"foursquareNYC\")"
 ]
 },
 {
@@ -591,9 +611,21 @@
 ]
 },
 {
-"cell_type": "markdown",
+"cell_type": "code",
+"execution_count": 3,
 "metadata": {},
-"source": []
+"outputs": [],
+"source": [
+"path = \"data/brightkite/loc-brightkite_totalCheckins.txt\"\n",
+"# modify the file, take only the first and last column, return a test.txt file. Use pandas\n",
+"\n",
+"def modify_file(path):\n",
+"    df = pd.read_csv(path, sep=\"\\t\", header=None, names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
+"    df = df.iloc[:, [0, 4]]\n",
+"    df.to_csv(\"test.txt\", sep=\"\\t\", header=None, index=None)\n",
+"\n",
+"modify_file(path)"
+]
 }
 ],
 "metadata": {
