Initial commit, started to build the networks

main
Luca Lombardo 2 years ago
parent fb92013911
commit 83feae8fb9

.gitignore (4 changes)

@@ -127,3 +127,7 @@ dmypy.json
# Pyre type checker
.pyre/
# Data folder
data/

main.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3

import os
import zipfile
from itertools import combinations

import networkx as nx
import pandas as pd
import wget


def download_datasets():
    urls = [
        ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"],
        ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"],
        ["http://www-public.it-sudparis.eu/~zhang_da/pub/dataset_tsmc2014.zip"]
    ]

    folders = ["brightkite", "gowalla", "foursquare"]

    # create the data folder and a subfolder for each dataset, if they don't exist
    if not os.path.exists("data"):
        os.mkdir("data")
    for folder in folders:
        if not os.path.exists(os.path.join("data", folder)):
            os.mkdir(os.path.join("data", folder))

    # download each dataset into its own folder; the download is skipped (with a
    # message) when a .txt (already extracted) or .gz (already downloaded) file
    # is already present
    for folder, folder_urls in zip(folders, urls):
        path = os.path.join("data", folder)
        for url in folder_urls:
            if any(file.endswith(".txt") for file in os.listdir(path)):
                print("The {} dataset is already downloaded and extracted as a .txt file. To download the .gz file again with this function, delete the .txt files in the folder".format(folder))
                break
            elif any(file.endswith(".gz") for file in os.listdir(path)):
                print("The {} dataset is already downloaded as a .gz file. To download it again with this function, delete the .gz files in the folder".format(folder))
                break
            else:
                print("Downloading {} dataset...".format(folder))
                wget.download(url, path)
                print("Download of the {} dataset completed".format(folder))

    """
    The code below is ugly to read, but it's effective. Whatever messy state the
    foursquare folder ends up in (for example after testing), it brings the files
    back to the layout the program expects.

    If the folder contains a dataset_tsmc2014 sub-folder and the foursquare folder
    already holds 3 entries, everything is in order and nothing is done; otherwise
    the files are moved out of dataset_tsmc2014 and the now-empty sub-folder is
    removed (we don't want a nested folder). If there is no dataset_tsmc2014
    sub-folder yet, the .zip is extracted first, then the .txt files are moved up
    in the same way and both the sub-folder and the .zip file are deleted.
    """
    foursquare = os.path.join("data", "foursquare")
    nested = os.path.join(foursquare, "dataset_tsmc2014")
    for file in os.listdir(foursquare):
        if file.endswith(".zip"):
            if os.path.exists(nested):
                if len(os.listdir(foursquare)) != 3:
                    for inner in os.listdir(nested):
                        os.rename(os.path.join(nested, inner), os.path.join(foursquare, inner))
                    os.rmdir(nested)
            else:
                with zipfile.ZipFile(os.path.join(foursquare, file), 'r') as zip_ref:
                    zip_ref.extractall(foursquare)
                os.remove(os.path.join(foursquare, file))
                for inner in os.listdir(nested):
                    os.rename(os.path.join(nested, inner), os.path.join(foursquare, inner))
                os.rmdir(nested)

    # if there are no .txt files inside the brightkite or gowalla folder,
    # extract the .gz archives
    for folder in ["brightkite", "gowalla"]:
        path = os.path.join("data", folder)
        if not any(file.endswith(".txt") for file in os.listdir(path)):
            for file in os.listdir(path):
                if file.endswith(".gz"):
                    os.system("gunzip {}".format(os.path.join(path, file)))


def create_graph(dataset):
    """
    Takes the name of a dataset whose edge list is stored as a tsv file with two
    columns, where each line is an edge. Returns an undirected networkx graph
    object. It uses pandas to read the file since it's faster than the standard
    python open() function; with open() the following code would work as well:

        G = nx.Graph()
        with open(file, "r") as f:
            for line in f:
                node1, node2 = line.split("\t")
                G.add_edge(node1, node2)
    """
    if dataset == "brightkite":
        file = os.path.join("data", "brightkite", "loc-brightkite_edges.txt")
    elif dataset == "gowalla":
        file = os.path.join("data", "gowalla", "loc-gowalla_edges.txt")
    else:
        raise ValueError("The dataset must be brightkite or gowalla. If you want to use the foursquare dataset, use the create_foursquare_graph() function")

    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
    return G


def create_foursquare_graph(dataset):
    """
    Takes either "NYC" or "TKY" and reads the corresponding check-ins file, a tsv
    with 8 columns of which only the first two, "UserID" and "VenueID", are needed.
    Returns an undirected networkx graph whose nodes are the unique user IDs; two
    users are linked if they have checked in at the same venue at least once, and
    the weight of the edge is the number of venues they share.
    """
    if dataset == "NYC":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
    elif dataset == "TKY":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
    else:
        raise ValueError("The dataset must be NYC or TKY")

    # the files are not valid utf-8 (venue names contain bytes such as 0xe9),
    # so they have to be read as latin-1
    df = pd.read_csv(file, sep="\t", header=None, encoding="latin-1", names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"])

    users = set(df["UserID"])  # the unique user IDs are the nodes
    G = nx.Graph()
    G.add_nodes_from(users)

    # dictionary with the user IDs as keys and the sets of visited venues as values
    users_venues = df.groupby("UserID")["VenueID"].apply(set).to_dict()

    # quadratic loop over the pairs of users, the price of this construction
    for user1, user2 in combinations(users, 2):
        shared_venues = users_venues[user1] & users_venues[user2]
        if shared_venues:
            G.add_edge(user1, user2, weight=len(shared_venues))
    return G
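

# A minimal usage sketch, not required by the notebook (which imports the
# functions above with `from main import *`): download the data, build the
# three graphs, and print their sizes. Assumes the default folder layout
# created by download_datasets().
if __name__ == "__main__":
    download_datasets()
    for name, G in [("brightkite", create_graph("brightkite")),
                    ("gowalla", create_graph("gowalla")),
                    ("foursquare NYC", create_foursquare_graph("NYC"))]:
        print("{} graph: {} nodes, {} edges".format(name, G.number_of_nodes(), G.number_of_edges()))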

Binary file not shown.

@@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"\n",
"import os\n",
"import zipfile\n",
"import wget\n",
"import networkx as nx\n",
"from main import *\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Discovering the datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To perform our analysis, we will use the following datasets:\n",
"\n",
"- **Brightkite**\n",
"- **Gowalla**\n",
"- **Foursquare**\n",
"\n",
"We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n",
"\n",
"```\n",
"data/\n",
"├── brightkite\n",
"│   ├── loc-brightkite_edges.txt.gz\n",
"│   ├── loc-brightkite_totalCheckins.txt.gz\n",
"├── foursquare\n",
"│   ├── loc-gowalla_edges.txt.gz\n",
"│   ├── loc-gowalla_totalCheckins.txt.gz\n",
"└── gowalla\n",
" ├── dataset_ubicomp2013_checkins.txt\n",
" ├── dataset_ubicomp2013_tags.txt\n",
" └── dataset_ubicomp2013_tips.txt\n",
"```\n",
"\n",
"If any of the datasets is already downloaded, it will not be downloaded again. For futher details about the function below, please refer to the `utils` module."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The brightkite dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
"The gowalla dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
"Downloading foursquare dataset...\n",
"Download completed of foursquare dataset\n"
]
}
],
"source": [
"download_datasets()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's have a deeper look at them.\n",
"\n",
"## Brightkite\n",
"\n",
"[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. The network was originally directed but the authors of the dataset have constructed a network with undirected edges when there is a friendship in both ways. They also have also collected a total of `4491143` checking of these users over the period of Apr. 2008 - Oct. 2010.\n",
"\n",
"Here is an example of check-in information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Brightkite_df = pd.read_csv(\"data/brightkite/loc-brightkite_totalCheckins.txt.gz\", sep=\"\\t\", header=None, compression=\"gzip\", names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
"\n",
"Brightkite_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Gowalla\n",
"\n",
"Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. The authors have collected a total of `6442890` check-ins of these users over the period of Feb. 2009 - Oct. 2010.\n",
"\n",
"Here is an example of check-in information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Gowalla_df = pd.read_csv(\"data/gowalla/loc-gowalla_totalCheckins.txt.gz\", sep=\"\\t\", header=None, compression=\"gzip\", names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
"\n",
"Gowalla_df.head() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Foursquare\n",
"\n",
"DA RISCRIVERE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# remove from memory, they were created only for aesthetic purposes in the notebook\n",
"\n",
"del Brightkite_df\n",
"del Gowalla_df\n",
"del Foursquare_checks_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building the networks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are asked to construct the networks for the three datasets as un undirected grah $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicates that two individuals visited the same location at least once.\n",
"\n",
"We can use the fucntion create_graph from the utils module to create the networks. It takes as input the path to an edge list file and returns a networkx graph object. For further details about the function below, please refer to the `utils` module."
]
},
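{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before building the real networks, here is a quick sanity check of the co-check-in rule used for Foursquare, on a toy example with made-up users and venues: two users get an edge whose weight is the number of venues they share."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toy example with made-up users and venues (not part of the real datasets)\n",
"toy_venues = {\"u1\": {\"v1\", \"v2\"}, \"u2\": {\"v2\", \"v3\"}, \"u3\": {\"v4\"}}\n",
"\n",
"toy_G = nx.Graph()\n",
"toy_G.add_nodes_from(toy_venues)\n",
"for u1 in toy_venues:\n",
"    for u2 in toy_venues:\n",
"        if u1 < u2 and toy_venues[u1] & toy_venues[u2]:\n",
"            toy_G.add_edge(u1, u2, weight=len(toy_venues[u1] & toy_venues[u2]))\n",
"\n",
"print(toy_G.edges(data=True))  # expected: [('u1', 'u2', {'weight': 1})]\n",
"\n",
"del toy_G, toy_venues"
]
},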
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._string_convert\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._string_box_utf8\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_154187/2796184490.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mBrightkite_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"brightkite\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mGowalla_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gowalla\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mFoursquare_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_foursquare_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"NYC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/github/small-worlds/main.py\u001b[0m in \u001b[0;36mcreate_foursquare_graph\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"foursquare\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dataset_TSMC2014_TKY.txt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"UserID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"VenueID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"CategoryID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"CategoryName\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Latitude\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Longitude\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Timezone offset in minutes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"UTC time\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;31m# use the set() data structure to get the unique users ID\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minspect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcurrentframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m )\n\u001b[0;32m--> 317\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 948\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 950\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 611\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 612\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1771\u001b[0m \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1772\u001b[0;31m \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1773\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1774\u001b[0m )\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlow_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_low_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 244\u001b[0m \u001b[0;31m# destructive to chunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_concatenate_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._string_convert\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._string_box_utf8\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data"
]
}
],
"source": [
"Brightkite_G = create_graph(\"brightkite\")\n",
"Gowalla_G = create_graph(\"gowalla\")\n",
"Foursquare_G = create_foursquare_graph(\"NYC\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can have a look at the number of nodes and edges in each network."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Brightkite graph has {} nodes and {} edges\".format(Brightkite_G.number_of_nodes(), Brightkite_G.number_of_edges()))\n",
"\n",
"print(\"Gowalla graph has {} nodes and {} edges\".format(Gowalla_G.number_of_nodes(), Gowalla_G.number_of_edges()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}