diff --git a/main.py b/main.py
deleted file mode 100644
index ae72fd8..0000000
--- a/main.py
+++ /dev/null
@@ -1,221 +0,0 @@
-"""
-NOTES:
-
-- This file is not meant to be run: it's just a collection of functions that are used in the other files, a way to keep the code clean and organized.
-
-- Why do I use os.path.join and not "/"? Because it's more portable: it works on every OS, while "/" only works on Linux and Mac. If you wanted to run the code on Windows, you would have to change every "/" into "\". With os.path.join you don't have to worry about it and, as always, f*** Microsoft.
-"""
-
-import os
-import wget
-import zipfile
-import pandas as pd
-import networkx as nx
-from typing import Literal
-from itertools import combinations
-
-# ------------------------------------------------------------------------#
-
-def download_datasets():
-
-    urls = [
-        ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"],
-
-        ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"],
-
-        ["http://www-public.it-sudparis.eu/~zhang_da/pub/dataset_tsmc2014.zip"]
-    ]
-
-    folders = ["brightkite", "gowalla", "foursquare"]
-
-    # check if the data folder exists
-    if not os.path.exists("data"):
-        os.mkdir("data")
-
-    # if they don't exist, create 3 subfolders in data called brightkite, gowalla and foursquare
-    for folder in folders:
-        if not os.path.exists(os.path.join("data", folder)):
-            os.mkdir(os.path.join("data", folder))
-
-    # download every url in urls[0] into the brightkite folder, every url in urls[1] into the gowalla folder, and every url in urls[2] into the foursquare folder. If a file is already downloaded, skip the download
-
-    for i in range(len(urls)):
-        for url in urls[i]:
-            if not os.path.exists(os.path.join("data", folders[i], url.split("/")[-1])):
-                wget.download(url, os.path.join("data", folders[i]))
-
-    # unzip the .gz files inside the brightkite and gowalla folders
-    for file in os.listdir(os.path.join("data", "brightkite")):
-        if file.endswith(".gz"):
-            os.system("gunzip {}".format(os.path.join("data", "brightkite", file)))
-
-    for file in os.listdir(os.path.join("data", "gowalla")):
-        if file.endswith(".gz"):
-            os.system("gunzip {}".format(os.path.join("data", "gowalla", file)))
-
-    # extract the foursquare data in a nice way, checking all the edge cases like a maniac. More details below
-
-    """
-    The code below is ugly to read, but it's effective. Whatever messy state the files inside the foursquare folder may be in (maybe after testing), it fixes them and brings them back to the layout the program expects.
-
-    First it checks whether the foursquare folder contains a folder called dataset_tsmc2014. If it does, it checks whether the foursquare folder contains 3 files: if so, everything is in order and the process is skipped. If not, it moves all the files inside the dataset_tsmc2014 folder to the foursquare folder and deletes the dataset_tsmc2014 folder (we don't want a nested folder).
-
-    If there is no dataset_tsmc2014 folder, it unzips the archive, then moves all the .txt files from the dataset_tsmc2014 folder to the foursquare folder, and finally deletes the dataset_tsmc2014 folder and the .zip file.
- """ - - for file in os.listdir(os.path.join("data", "foursquare")): - if file.endswith(".zip"): - if os.path.exists(os.path.join("data", "foursquare", "dataset_tsmc2014")): - if len(os.listdir(os.path.join("data", "foursquare"))) == 3: - pass - else: - for file in os.listdir(os.path.join("data", "foursquare", "dataset_tsmc2014")): - os.rename(os.path.join("data", "foursquare", "dataset_tsmc2014", file), os.path.join("data", "foursquare", file)) - os.rmdir(os.path.join("data", "foursquare", "dataset_tsmc2014")) - else: - with zipfile.ZipFile(os.path.join("data", "foursquare", file), 'r') as zip_ref: - zip_ref.extractall(os.path.join("data", "foursquare")) - os.remove(os.path.join("data", "foursquare", file)) - for file in os.listdir(os.path.join("data", "foursquare", "dataset_tsmc2014")): - os.rename(os.path.join("data", "foursquare", "dataset_tsmc2014", file), os.path.join("data", "foursquare", file)) - os.rmdir(os.path.join("data", "foursquare", "dataset_tsmc2014")) - - # Now we want to clean our data. Both for brightkite and gowalla, we want to rename _edges files as "brightkite_friends_edges.txt" and "gowalla_friends_edges.txt" - - for file in os.listdir(os.path.join("data", "brightkite")): - if file.endswith("_edges.txt"): - os.rename(os.path.join("data", "brightkite", file), os.path.join("data", "brightkite", "brightkite_friends_edges.txt")) - - for file in os.listdir(os.path.join("data", "gowalla")): - if file.endswith("_edges.txt"): - os.rename(os.path.join("data", "gowalla", file), os.path.join("data", "gowalla", "gowalla_friends_edges.txt")) - - # Now we from the _totalCheckins.txt files we want to keep only the first and last column, which are the user ID and the venue ID. We also want to remove the header of the file. Use pandas to do that. Then rename the files as "brightkite_checkins_edges.txt" and "gowalla_checkins_edges.txt - - for file in os.listdir(os.path.join("data", "brightkite")): - if file.endswith("_totalCheckins.txt"): - df = pd.read_csv(os.path.join("data", "brightkite", file), sep="\t", header=None) - df = df[[0, 2]] - df.to_csv(os.path.join("data", "brightkite", "brightkite_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8") - os.remove(os.path.join("data", "brightkite", file)) - - for file in os.listdir(os.path.join("data", "gowalla")): - if file.endswith("_totalCheckins.txt"): - df = pd.read_csv(os.path.join("data", "gowalla", file), sep="\t", header=None) - df = df[[0, 2]] - df.to_csv(os.path.join("data", "gowalla", "gowalla_checkins.txt"), sep="\t", header=False, index=False, errors="ignore", encoding="utf-8") - os.remove(os.path.join("data", "gowalla", file)) - - # now for foursquare we want to keep only the first and second column, which are the user ID and the venue ID. We also want to remove the header of the file. Use pandas to do that. Do that for both _NYC.txt and _TKY.txt files. 
-
-    for file in os.listdir(os.path.join("data", "foursquare")):
-        if file.endswith("_NYC.txt"):
-            df = pd.read_csv(os.path.join("data", "foursquare", file), sep="\t", header=None, encoding="utf-8", encoding_errors="ignore")
-            df = df[[0, 1]]
-            df.to_csv(os.path.join("data", "foursquare", "foursquare_checkins_NYC.txt"), sep="\t", header=False, index=False)
-            os.remove(os.path.join("data", "foursquare", file))
-
-        if file.endswith("_TKY.txt"):
-            df = pd.read_csv(os.path.join("data", "foursquare", file), sep="\t", header=None, encoding="utf-8", encoding_errors="ignore")
-            df = df[[0, 1]]
-            df.to_csv(os.path.join("data", "foursquare", "foursquare_checkins_TKY.txt"), sep="\t", header=False, index=False)
-            os.remove(os.path.join("data", "foursquare", file))
-
-# ------------------------------------------------------------------------#
-
-def create_checkins_graph_SLOW(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
-
-    """
-    This function takes as input a tsv file where each line is a check-in, and returns an undirected networkx graph object.
-
-    First, we retrieve the unique user IDs: these are the nodes of our graph. We create a dictionary with the user IDs as keys and the lists of visited venue IDs as values. Two users are connected if they have visited the same venue at least once; the weight of the edge is the number of venues they have in common.
-    """
-
-    if dataset not in ['brightkite', 'gowalla',
-                       'foursquareNYC', 'foursquareTKY']:
-        raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquareNYC, foursquareTKY")
-
-    # based on the dataset, we have to read the file in a different way.
-    if dataset == "foursquareNYC":
-        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
-        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
-
-    elif dataset == "foursquareTKY":
-        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
-        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "LocalTime", "UTCtime"], encoding="utf-8", encoding_errors="ignore")
-    else:
-        file = os.path.join("data", dataset, "loc-{}_totalCheckins.txt".format(dataset))
-        df = pd.read_csv(file, sep="\t", header=None, names=["UserID", "CheckIn", "latitude", "longitude", "VenueID"], encoding="utf-8", encoding_errors="ignore")
-
-    # get the unique user IDs
-    users = df["UserID"].unique()
-    G = nx.Graph()
-    G.add_nodes_from(users)
-    print("Number of nodes added to the graph {}: {}".format(dataset, G.number_of_nodes()))
-
-    users_venues = df.groupby("UserID")["VenueID"].apply(list).to_dict()
-
-    for user1, user2 in combinations(users, 2):
-        intersection = set(users_venues[user1]) & set(users_venues[user2])
-        if len(intersection) > 0:
-            G.add_edge(user1, user2, weight=len(intersection))
-
-    print("Number of edges added to the graph {}: {}".format(dataset, G.number_of_edges()))
-
-    return G
-
-# ------------------------------------------------------------------------#
-
-def friendships_graph(dataset: Literal['brightkite', 'gowalla']) -> nx.Graph:
-
-    """
-    This function takes as input a tsv file with two columns; each line in the file is an edge. The function returns an undirected networkx graph object.
-    It uses pandas to read the file, since it's faster than the standard python open() function. If we wanted to use open() instead, the following code would work as well:
-
-        G = nx.Graph()
-        with open(file, "r") as f:
-            for line in f:
-                node1, node2 = line.strip().split("\t")
-                G.add_edge(node1, node2)
-    """
-
-    if dataset not in ["brightkite", "gowalla"]:
-        raise ValueError("The dataset must be brightkite or gowalla")
-
-    file = os.path.join("data", dataset, "{}_friends_edges.txt".format(dataset))
-    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
-    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
-
-    return G
-
-# ------------------------------------------------------------------------#
-
-def checkins_graph_from_edges(dataset: Literal['brightkite', 'gowalla', 'foursquareNYC', 'foursquareTKY']) -> nx.Graph:
-
-    """
-    This function takes as input a tsv file with two columns; each line in the file is an edge. The function returns an undirected networkx graph object. It uses pandas to read the file, since it's faster than the standard python open() function. If we wanted to use open() instead, the following code would work as well:
-
-        G = nx.Graph()
-        with open(file, "r") as f:
-            for line in f:
-                node1, node2 = line.strip().split("\t")
-                G.add_edge(node1, node2)
-    """
-
-    if dataset not in ["brightkite", "gowalla", "foursquareNYC", "foursquareTKY"]:
-        raise ValueError("The dataset must be brightkite, gowalla, foursquareNYC or foursquareTKY")
-
-    file = os.path.join("data", dataset, "{}_checkins_edges.tsv".format(dataset))
-
-    # if dataset == "foursquareTKY":
-    #     file = os.path.join("data", "foursquare", "foursquareNYC_checkins_graph.tsv")
-    # elif dataset == "foursquareNYC":
-    #     file = os.path.join("data", "foursquare", "foursquareTKY_checkins_graph.tsv")
-    # else:
-    #     file = os.path.join("data", dataset, "{}_checkins_graph.tsv".format(dataset))
-
-    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
-    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
-
-    return G
diff --git a/testing.ipynb b/testing.ipynb
index 39bf6a7..a38229b 100644
--- a/testing.ipynb
+++ b/testing.ipynb
@@ -6,14 +6,24 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "%reload_ext autoreload\n",
+   "%load_ext autoreload\n",
+   "%autoreload 2\n",
    "\n",
    "import os\n",
-   "import zipfile\n",
    "import wget\n",
+   "import zipfile\n",
+   "import numpy as np\n",
+   "import pandas as pd\n",
    "import networkx as nx\n",
-   "from main import *\n",
-   "import pandas as pd"
+   "import plotly.graph_objects as go\n",
+   "from utils import *\n",
+   "from collections import Counter\n",
+   "from tqdm import tqdm\n",
+   "import time\n",
+   "\n",
+   "# ignore warnings\n",
+   "import warnings\n",
+   "warnings.filterwarnings(\"ignore\")"
  ]
 },
@@ -37,12 +47,14 @@
   "We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n",
   "\n",
   "```\n",
+  "data\n",
   "├── brightkite\n",
-  "│   ├── brightkite_checkins.txt\n",
-  "│   └── brightkite_friends_edges.txt\n",
+  "│ ├── brightkite_checkins.txt\n",
+  "│ └── brightkite_friends_edges.txt\n",
   "├── foursquare\n",
-  "│   ├── foursquare_checkins_NYC.txt\n",
-  "│   ├── foursquare_checkins_TKY.txt\n",
+  "│ ├── foursquare_checkins.txt\n",
+  "│ ├── foursquare_friends_edges.txt\n",
+  "│ └── raw_POIs.txt\n",
   "└── gowalla\n",
   "    ├── gowalla_checkins.txt\n",
   "    └── gowalla_friends_edges.txt\n",
   "```\n",
@@ -50,12 +62,17 @@
   "\n",
   "If any of the datasets is already downloaded, it will not be downloaded again. For further details about the function below, please refer to the `utils` module.\n",
   "\n",
-  "> NOTE: the Stanford servers tends to be slow, so it may take a while to download the datasets. It's gonna take about 2 to 3 minutes to download all the datasets."
+  "> NOTE: the Stanford servers tend to be slow, so it may take a while to download the datasets: about 5 minutes for all of them.\n",
+  "\n",
+  "---\n",
+  "\n",
+  "### A deeper look at the datasets\n",
+  "\n"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -71,10 +88,12 @@
   "\n",
   "## Brightkite\n",
   "\n",
-  "[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. We will work with two different datasets:\n",
+  "[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. We will work with two different datasets. This is how they look after being filtered by the `download_dataset` function:\n",
+  "\n",
+  "- `data/brightkite/brightkite_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids. This file is untouched by the function; it's already in the form of a graph edge list.\n",
+  "\n",
+  "\n",
-  "- `data/brightkite/brightkite_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids\n",
-  "- `data/brightkite/brightkite_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list, in the next section we will see how to convert it into a graph."
+  "- `data/brightkite/brightkite_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list; in the next section we will see how to convert it into a graph. Originally there were other columns, but we will not use them."
  ]
 },
@@ -84,10 +103,11 @@
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## Gowalla\n",
   "\n",
-  "Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. As for Brightkite, we will work with two different datasets:\n",
+  "Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. As for Brightkite, we will work with two different datasets. This is how they look after being filtered by the `download_dataset` function:\n",
   "\n",
-  "- `data/gowalla/gowalla_friends_edges.txt`: the friendship network, a tsv file with 2 columns of users ids\n",
-  "- `data/gowalla/gowalla_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list, in the next section we will see how to convert it into a graph."
+  "- `data/gowalla/gowalla_checkins.txt`: the checkins, a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list. Originally there were other columns, such as the time of the checkins. During the filtering, we used this information to keep only the checkins from 2009 and then deleted it. This is why the number of checkins is smaller than in the original dataset.\n",
+  "\n",
+  "- `data/gowalla/gowalla_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids. This file is untouched by the function; it's already in the form of a graph edge list. In the next section, when we build the friendship network, we will only consider users with at least one check-in in 2009, to avoid biases in the analysis."
  ]
 },
@@ -97,12 +117,15 @@
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## Foursquare\n",
   "\n",
-  "[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 10 months) check-in data in New York city and Tokyo collected from Foursquare from 12 April 2012 to 16 February 2013. It contains two files in tsv format. Each file contains 2 columns, which are:\n",
+  "[Foursquare](https://foursquare.com/) is a location-based social networking website where users share their locations by checking-in. This dataset includes long-term (about 22 months, from Apr. 2012 to Jan. 2014) global-scale check-in data collected from Foursquare, together with two snapshots of the user social network taken before and after the check-in collection period (see the original paper for more details). We will work with three different datasets:\n",
+  "\n",
+  "- `data/foursquare/foursquare_checkins.txt`: a tsv file with 2 columns of user id and location. This is not in the form of a graph edge list. This file remains untouched by the function but, due to its size, in the next sections we will focus on the EU sub-sample and the IT sub-sample. The friendship edge list will be modified accordingly.\n",
   "\n",
-  "1. User ID (anonymized)\n",
-  "2. Venue ID (Foursquare)\n",
+  "- `data/foursquare/foursquare_friends_edges.txt`: the friendship network, a tsv file with 2 columns of user ids. This is in the form of a graph edge list.\n",
   "\n",
-  "In this case, we don't have any information about the friendship network, so we will only work with the checkins."
+  "- `data/foursquare/raw_POIs.txt`: the POIs, a tsv file with 2 columns of location id and country ISO code. We are going to use this file to create the sub-samples of the dataset.\n",
+  "\n",
+  "> **NOTE:** In this case I preferred not to take sub-samples based on time. The reason is that there may be periods when the social network was not very popular in some countries, which would bias the analysis. Instead, I decided to take sub-samples based on the country: this gives a more homogeneous dataset.\n",
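+  "\n",
+  "As a rough illustration, a country-based sub-sample could be built like in the sketch below. This is only a hypothetical sketch, not the actual implementation in `utils.py`: in particular, the exact column layout of `raw_POIs.txt` is an assumption here.\n",
+  "\n",
+  "```python\n",
+  "import pandas as pd\n",
+  "\n",
+  "# venue -> country mapping (assumed layout: venue id, ISO country code)\n",
+  "pois = pd.read_csv('data/foursquare/raw_POIs.txt', sep='\\t', header=None, names=['venue_id', 'country'])\n",
+  "it_venues = set(pois.loc[pois['country'] == 'IT', 'venue_id'])\n",
+  "\n",
+  "# keep only the check-ins that happened at one of those venues\n",
+  "checkins = pd.read_csv('data/foursquare/foursquare_checkins.txt', sep='\\t', header=None, names=['user_id', 'venue_id'])\n",
+  "checkins_it = checkins[checkins['venue_id'].isin(it_venues)]\n",
+  "```"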
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "# Build the networks"
  ]
 },
@@ -117,39 +140,16 @@
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-  "We are asked to construct the networks for the three datasets as un undirected graph $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicates that two individuals visited the same location at least once.\n",
+  "We are asked to construct the networks for the three datasets as an undirected graph $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicate that two individuals visited the same location at least once.\n",
   "\n",
-  "And this is were the fun begins! The check-ins files of the three datasets are not in the form of a graph edge list, so we need to manipulate them. But those datasets are huge! Let's have a look at the number of lines of each file."
+  "The check-ins files of the three datasets are not in the form of a graph edge list, so we need to manipulate them. Let's have a look at the number of lines of each file (note that gowalla is already filtered: only 2009 data are present)."
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "gowalla\n",
-    "Number of lines: 6442892\n",
-    "Number of unique elements: 107092\n",
-    "\n",
-    "brightkite\n",
-    "Number of lines: 4747287\n",
-    "Number of unique elements: 51406\n",
-    "\n",
-    "foursquare\n",
-    "Number of lines: 227428\n",
-    "Number of unique elements: 1083\n",
-    "\n",
-    "foursquare\n",
-    "Number of lines: 573703\n",
-    "Number of unique elements: 2293\n",
-    "\n"
-   ]
-  }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
 "source": [
   "def count_lines_and_unique_elements(file):\n",
   "    df = pd.read_csv(file, sep='\\t', header=None)\n",
   "    print('Number of lines: ', len(df))\n",
   "    print('Number of unique elements: ', len(df[0].unique()))\n",
   "\n",
@@ -158,10 +158,9 @@
   "gowalla_path = os.path.join('data', 'gowalla', 'gowalla_checkins.txt')\n",
   "brightkite_path = os.path.join('data', 'brightkite', 'brightkite_checkins.txt')\n",
-  "foursquareNYC_path = os.path.join('data', 'foursquare', 'foursquare_checkins_NYC.txt')\n",
-  "foursquareTKY_path = os.path.join('data', 'foursquare', 'foursquare_checkins_TKY.txt')\n",
+  "foursquare_path = os.path.join('data', 'foursquare', 'foursquare_checkins.txt')\n",
   "\n",
-  "_ = [gowalla_path, brightkite_path, foursquareNYC_path, foursquareTKY_path]\n",
+  "_ = [gowalla_path, brightkite_path, foursquare_path]\n",
   "\n",
   "for path in _:\n",
   "    print(path.split(os.sep)[-2])\n",
   "    count_lines_and_unique_elements(path)\n",
   "    print()"
  ]
 },
@@ -174,43 +173,91 @@
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-  "We would like to build a graph starting from an edge list. So the basic idea is to create a dictionary where the keys are the unique users and the values are the locations that they visited. Then, we can iterate over the dictionary and create the edges.\n",
-  "\n",
-  "But, even if we avoids repetitions, the time complexity will be $O(n^2)$, where $n$ is the number of users. And since $n$ is in the order of millions, doing this in python, where we have to build nested for loops, it's a no-go. We need to find a faster way to do this.\n",
-  "\n",
-  "In the `utils` module I provided anyway a function that does exactly this, but I do not raccomend to use it unless you have countless hours of time spare. It's called `create_checkicreate_checkins_graph_SLOW` and it takes a dataset name as input and returns a networkx graph object. \n",
-  "\n",
-  "SCRIVERE QUALCOSA RIGUARDO LA FUNZIONE IN C++\n",
-  "\n",
-  "The function will output a new .tsv file in the form of an edge list, in the `data` folder. Since the C++ program needs to be compiled, I have already created the edge lists for the four datasets, so you can skip this step if you want.\n",
-  "\n",
-  "Once that we have our edge list, we can build the graph using the function `checkins_graph_from_edges` from the `utils` module. It takes as input the name of the dataset and returns a networkx graph object. The options are\n",
+  "We would like to build a graph starting from an edge list. To do that, we are going to check, for each venue, all the users that visited it. Then, we will create an edge between each pair of users that visited the same venue (avoiding repetitions). This can be easily done in python, but it's going to be a bit slow (this is why we are considering sub-samples of the datasets). Let's see how to do it.\n",
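+  "\n",
+  "Instead of looking at all $\\binom{n}{2}$ pairs of users, this venue-based approach only generates pairs within each venue, so the cost is $\\sum_v \\binom{|U_v|}{2}$, where $U_v$ is the set of users that checked in at venue $v$. As long as most venues have few distinct visitors, this is far cheaper than the $O(n^2)$ scan over all user pairs.\n",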
+  "\n",
+  "```python\n",
+  "# let df be the dataframe [\"user_id\", \"venue_id\"] of the checkins\n",
+  "\n",
+  "venues_users = df.groupby(\"venue_id\")[\"user_id\"].apply(set)\n",
+  "\n",
+  "for users in venues_users:\n",
+  "    for user1, user2 in combinations(users, 2):\n",
+  "        G.add_edge(user1, user2)\n",
+  "```\n",
+  "\n",
+  "In the `utils.py` module, we have a function that does exactly this, called `create_graph_from_checkins`. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n",
   "\n",
   "- `brightkite`\n",
   "- `gowalla`\n",
-  "- `foursquareNYC`\n",
-  "- `foursquareTKY`\n",
+  "- `foursquareEU`\n",
+  "- `foursquareIT`\n",
   "\n",
   "Let's see how it works:"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
+  "# It takes about 4 minutes to create all 4 graphs on an i7-8750H CPU\n",
+  "\n",
-  "G_brighkite_checkins = checkins_graph_from_edges('brightkite')\n",
+  "G_brighkite_checkins = create_graph_from_checkins('brightkite')\n",
   "G_brighkite_checkins.name = 'Brightkite Checkins Graph'\n",
   "\n",
-  "G_gowalla_checkins = checkins_graph_from_edges('gowalla')\n",
+  "G_gowalla_checkins = create_graph_from_checkins('gowalla')\n",
   "G_gowalla_checkins.name = 'Gowalla Checkins Graph'\n",
   "\n",
-  "G_foursquareNYC_checkins = checkins_graph_from_edges('foursquareNYC')\n",
-  "G_foursquareNYC_checkins.name = 'Foursquare NYC Checkins Graph'\n",
+  "G_foursquareEU_checkins = create_graph_from_checkins('foursquareEU')\n",
+  "G_foursquareEU_checkins.name = 'Foursquare EU Checkins Graph'\n",
+  "\n",
+  "G_foursquareIT_checkins = create_graph_from_checkins('foursquareIT')\n",
+  "G_foursquareIT_checkins.name = 'Foursquare IT Checkins Graph'"
-  "\n",
-  "G_foursquareTKY_checkins = checkins_graph_from_edges('foursquareTKY')\n",
-  "G_foursquareTKY_checkins.name = 'Foursquare TKY Checkins Graph'"
 ]
},
{
+ "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
+  "### Friendship network\n",
+  "\n",
+  "Now we want to create a graph where two users are connected if they are friends in the social network. We intend the concept of friendship in a \"facebook way\", not a \"twitter way\". More formally, the graph is not going to be directed and the edges are not going to be weighted. A user can't be friends with himself, and can't be friends with a user without that user being friends with him.\n",
+  "\n",
+  "Since we filtered the checkins for foursquare and gowalla, we are considering only the users that are also present in the check-ins graph.\n",
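+  "\n",
+  "As a rough illustration, the filtering could look like the sketch below. This is a hypothetical sketch under assumed file names, not the actual implementation in `utils.py`:\n",
+  "\n",
+  "```python\n",
+  "import pandas as pd\n",
+  "\n",
+  "# users that appear in the check-ins edge list written in the previous step\n",
+  "checkins = pd.read_csv('data/gowalla/gowalla_checkins_edges.tsv', sep='\\t', header=None, names=['node1', 'node2'])\n",
+  "users = set(checkins['node1']) | set(checkins['node2'])\n",
+  "\n",
+  "# keep only the friendship edges between those users\n",
+  "friends = pd.read_csv('data/gowalla/gowalla_friends_edges.txt', sep='\\t', header=None, names=['node1', 'node2'])\n",
+  "friends = friends[friends['node1'].isin(users) & friends['node2'].isin(users)]\n",
+  "G = nx.from_pandas_edgelist(friends, 'node1', 'node2', create_using=nx.Graph())\n",
+  "```\n",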
+  "\n",
+  "We can build this graph with the function `create_friendships_graph` in the `utils.py` module. It takes as input the name of the dataset and returns a networkx graph object. By default it will also write the edge list to a file in the respective dataset folder. The options are\n",
+  "\n",
+  "- `brightkite`\n",
+  "- `gowalla`\n",
+  "- `foursquareEU`\n",
+  "- `foursquareIT`\n",
+  "\n",
+  "> **NOTE:** This function is implemented without needing the check-ins graphs to be loaded in memory: it works directly on the edge list files. This choice was made since someone may want to analyse only the friendship network, in which case there is no need to load the check-ins graph and waste memory. Furthermore, networkx is tremendously slow when loading a graph from an edge list file (since it's written in pure python), so this choice is also motivated by speed.\n",
+  "\n",
+  "Let's see how it works:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "G_brighkite_friends = create_friendships_graph('brightkite')\n",
  "print(\"Computation done for Brightkite friendship graph\")\n",
  "G_brighkite_friends.name = 'Brightkite Friendship Graph'\n",
  "\n",
  "G_gowalla_friends = create_friendships_graph('gowalla')\n",
  "print(\"Computation done for (filtered) Gowalla friendship graph\")\n",
  "G_gowalla_friends.name = '(Filtered) Gowalla Friendship Graph'\n",
  "\n",
  "G_foursquareIT_friends = create_friendships_graph('foursquareIT')\n",
  "print(\"Computation done for Foursquare IT friendship graph\")\n",
  "G_foursquareIT_friends.name = 'Foursquare IT Friendship Graph'\n",
  "\n",
  "G_foursquareEU_friends = create_friendships_graph('foursquareEU')\n",
  "print(\"Computation done for Foursquare EU friendship graph\")\n",
  "G_foursquareEU_friends.name = 'Foursquare EU Friendship Graph'\n"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Now that we have our graphs, let's have a look at some basic information about them"
 ]
},
{
 "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Brightkite Checkins Graph\n",
-    "Number of nodes: 44058\n",
-    "Number of edges: 106699\n",
-    "\n",
-    "Gowalla Checkins Graph\n",
-    "Number of nodes: 44058\n",
-    "Number of edges: 106699\n",
-    "\n",
-    "Foursquare NYC Checkins Graph\n",
-    "Number of nodes: 2293\n",
-    "Number of edges: 31261\n",
-    "\n",
-    "Foursquare TKY Checkins Graph\n",
-    "Number of nodes: 1078\n",
-    "Number of edges: 7273\n",
-    "\n"
-   ]
-  }
- ],
- "source": [
-  "for G in [G_brighkite_checkins, G_gowalla_checkins, G_foursquareNYC_checkins, G_foursquareTKY_checkins]:\n",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "for G in [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]:\n",
   "    print(G.name)\n",
   "    print('Number of nodes: ', G.number_of_nodes())\n",
   "    print('Number of edges: ', G.number_of_edges())\n",
   "    print()"
 ]
},
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "# Analysis of the structure of the networks\n",
  ""
 ]
},
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## Degree distribution\n",
  "\n",
  ""
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "checkins_graphs = [G_brighkite_checkins, G_gowalla_checkins, G_foursquareEU_checkins, G_foursquareIT_checkins]\n",
  "\n",
  "for graph in checkins_graphs:\n",
  "    degree_distribution(graph, log=True)"
 ]
},
{
@@ -287,42 +358,308 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-  "Now that we have our graphs, let's have a look at some basic information about them"
+  "---\n",
+  "\n",
+  "Let's see how it changes for the friendship networks"
 ]
},
{
 "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Brightkite Friendship Graph\n",
-    "Number of nodes: 58228\n",
-    "Number of edges: 214078\n",
-    "\n",
-    "Gowalla Friendship Graph\n",
-    "Number of nodes: 196591\n",
-    "Number of edges: 950327\n",
-    "\n"
-   ]
-  }
- ],
- "source": [
-  "for G in [G_brighkite_friends, G_gowalla_friends]:\n",
-  "    print(G.name)\n",
-  "    print('Number of nodes: ', G.number_of_nodes())\n",
-  "    print('Number of edges: ', G.number_of_edges())\n",
-  "    print()"
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "friendship_graphs = [G_brighkite_friends, G_gowalla_friends, G_foursquareIT_friends, G_foursquareEU_friends]\n",
+  "\n",
+  "for graph in friendship_graphs:\n",
+  "    degree_distribution(graph, log=True)"
 ]
},
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "We may be curious to see if the whole friendship network has a different degree distribution than the filtered one. Let's see if there are any differences."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "G1 = nx.read_edgelist('data/brightkite/brightkite_friends_edges.txt', nodetype=int)\n",
  "G1.name = 'Brightkite Friendship Graph'\n",
  "G2 = nx.read_edgelist('data/gowalla/gowalla_friends_edges.txt', nodetype=int)\n",
  "G2.name = 'Gowalla Friendship Graph'\n",
  "G3 = nx.read_edgelist('data/foursquare/foursquare_friends_edges.txt', nodetype=int)\n",
  "G3.name = 'Foursquare Friendship Graph'\n",
  "\n",
  "degree_distribution(G1, log=True)\n",
  "degree_distribution(G2, log=True)\n",
  "degree_distribution(G3, log=True)"
 ]
},
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As we can see, there are no notable differences, and this is not surprising: we were only taking into consideration some edge cases. Maybe in Siberia this was a very popular social network, but since it's a very harsh environment, being friends on the social network is not synonymous with visiting the same places together (where do you go in Siberia?).\n",
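  "\n",
  "As a quick sanity check before the next cell: for an undirected graph, the average degree is simply $\\langle k \\rangle = 2|E|/|V|$, so it can also be read off the node and edge counts we printed earlier."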
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--- \n", + "\n", + "Now, we can compute the average degree for each checkins graph and for the friendship graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a dataframe with the average degree for each graph\n", + "average_degree = pd.DataFrame(columns=['Graph', 'Average Degree'], index=None)\n", + "\n", + "for graph in tqdm(checkins_graphs):\n", + " average_degree = average_degree.append({'Graph': graph.name, 'Average Degree': np.mean(list(dict(graph.degree()).values()))}, ignore_index=True)\n", + "\n", + "for graph in tqdm(friendships_graph):\n", + " average_degree = average_degree.append({'Graph': graph.name, 'Average Degree': np.mean(list(dict(graph.degree()).values()))}, ignore_index=True)\n", + "\n", + "print(average_degree) " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clustering coefficient\n", + "\n", + "\n", + "--- \n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analysis_results = pd.DataFrame(columns=['Graph', 'Number of Nodes', 'Number of Edges', 'Average Degree', 'Average Clustering Coefficient', 'log N', 'Average Shortest Path Length', 'betweenness centrality'], index=None)\n", + "\n", + "graphs_all = checkins_graphs + friendships_graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# It's going to take a while (about 10 minutes). The time complexity is O(n^2) since we are iterating over all the nodes and their neighbors. \n", + "\n", + "clustering_results = pd.DataFrame(columns=['Graph', 'Average Clustering Coefficient'], index=None)\n", + "\n", + "for graph in friendships_graph:\n", + " print(graph.name)\n", + " clustering_results = clustering_results.append(\n", + " {'Graph': graph.name, \n", + " 'Number of Nodes': graph.number_of_nodes(),\n", + " 'Number of Edges': graph.number_of_edges(),\n", + " 'Average Clustering Coefficient': nx.average_clustering(graph),\n", + " 'log N': np.log(graph.number_of_nodes()),\n", + " 'Average Shortest Path Length': mean_shortest_path(graph), \n", + " 'betweenness centrality': nx.betweenness_centrality(G)}, \n", + " ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(clustering_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use our formula to compute the clustering coefficient in a small world network" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Average Path Length\n", + "\n", + "" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Analysis of the structure of the networks" + "## Betweenness Centrality\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "betweenness_results = pd.DataFrame(columns=['Graph', 'Betweenness Centrality'])\n", + "\n", + "for graph in checkins_graphs:\n", + " betweenness_results = betweenness_results.append(\n", + " {'Graph': graph.name,\n", + " 'Betweenness Centrality': np.mean(list(nx.betweenness_centrality(graph).values()))}, \n", + " ignore_index=True)\n", + "\n", + "betweenness_results" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def small_world_clustering(graph: nx.Graph):\n", + " tmp = 0\n", + " for node in tqdm(graph.nodes()):\n", + " k = len(list(graph.neighbors(node)))\n", + " if k >=1:\n", + " tmp += (3*(k-1))/(2*(2*k-1))\n", + " return tmp/graph.number_of_nodes()\n", + "\n", + "print(\"Clustering coefficient for the Watts-Strogatz graph: \", small_world_clustering(G_ws))\n", + "\n", + "print(\"Clustering coefficient for the Brightkite checkins graph: \", small_world_clustering(G_brighkite_checkins))" ] } ],