Initial commit, started to build the networks

main
Luca Lombardo 2 years ago
parent fb92013911
commit 83feae8fb9

.gitignore (4 changes)

@@ -127,3 +127,7 @@ dmypy.json
# Pyre type checker
.pyre/
# Data folder
data/

main.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3

import os
import zipfile
from itertools import combinations

import networkx as nx
import pandas as pd
import wget


def download_datasets():
    urls = [
        ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"],
        ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"],
        ["http://www-public.it-sudparis.eu/~zhang_da/pub/dataset_tsmc2014.zip"]
    ]

    folders = ["brightkite", "gowalla", "foursquare"]

    # create the data folder and a subfolder for each dataset, if they don't exist
    if not os.path.exists("data"):
        os.mkdir("data")
    for folder in folders:
        if not os.path.exists(os.path.join("data", folder)):
            os.mkdir(os.path.join("data", folder))

    # download each dataset into its own folder; the download is skipped (with a
    # message) when a .txt (already extracted) or .gz (already downloaded) file
    # is already present
    for folder, folder_urls in zip(folders, urls):
        path = os.path.join("data", folder)
        for url in folder_urls:
            if any(file.endswith(".txt") for file in os.listdir(path)):
                print("The {} dataset is already downloaded and extracted as a .txt file. To download the .gz file again with this function, delete the .txt files in the folder".format(folder))
                break
            elif any(file.endswith(".gz") for file in os.listdir(path)):
                print("The {} dataset is already downloaded as a .gz file. To download it again with this function, delete the .gz files in the folder".format(folder))
                break
            else:
                print("Downloading {} dataset...".format(folder))
                wget.download(url, path)
                print("Download of the {} dataset completed".format(folder))

    """
    The code below is ugly to read, but it's effective. Whatever messy state the
    foursquare folder ends up in (for example after testing), it brings the files
    back to the layout the program expects.

    If the folder contains a dataset_tsmc2014 sub-folder and the foursquare folder
    already holds 3 entries, everything is in order and nothing is done; otherwise
    the files are moved out of dataset_tsmc2014 and the now-empty sub-folder is
    removed (we don't want a nested folder). If there is no dataset_tsmc2014
    sub-folder yet, the .zip is extracted first, then the .txt files are moved up
    in the same way and both the sub-folder and the .zip file are deleted.
    """
    foursquare = os.path.join("data", "foursquare")
    nested = os.path.join(foursquare, "dataset_tsmc2014")
    for file in os.listdir(foursquare):
        if file.endswith(".zip"):
            if os.path.exists(nested):
                if len(os.listdir(foursquare)) != 3:
                    for inner in os.listdir(nested):
                        os.rename(os.path.join(nested, inner), os.path.join(foursquare, inner))
                    os.rmdir(nested)
            else:
                with zipfile.ZipFile(os.path.join(foursquare, file), 'r') as zip_ref:
                    zip_ref.extractall(foursquare)
                os.remove(os.path.join(foursquare, file))
                for inner in os.listdir(nested):
                    os.rename(os.path.join(nested, inner), os.path.join(foursquare, inner))
                os.rmdir(nested)

    # if there are no .txt files inside the brightkite or gowalla folder,
    # extract the .gz archives
    for folder in ["brightkite", "gowalla"]:
        path = os.path.join("data", folder)
        if not any(file.endswith(".txt") for file in os.listdir(path)):
            for file in os.listdir(path):
                if file.endswith(".gz"):
                    os.system("gunzip {}".format(os.path.join(path, file)))


def create_graph(dataset):
    """
    Takes the name of a dataset whose edge list is stored as a tsv file with two
    columns, where each line is an edge. Returns an undirected networkx graph
    object. It uses pandas to read the file since it's faster than the standard
    python open() function; with open() the following code would work as well:

        G = nx.Graph()
        with open(file, "r") as f:
            for line in f:
                node1, node2 = line.split("\t")
                G.add_edge(node1, node2)
    """
    if dataset == "brightkite":
        file = os.path.join("data", "brightkite", "loc-brightkite_edges.txt")
    elif dataset == "gowalla":
        file = os.path.join("data", "gowalla", "loc-gowalla_edges.txt")
    else:
        raise ValueError("The dataset must be brightkite or gowalla. If you want to use the foursquare dataset, use the create_foursquare_graph() function")

    df = pd.read_csv(file, sep="\t", header=None, names=["node1", "node2"])
    G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
    return G


def create_foursquare_graph(dataset):
    """
    Takes either "NYC" or "TKY" and reads the corresponding check-ins file, a tsv
    with 8 columns of which only the first two, "UserID" and "VenueID", are needed.
    Returns an undirected networkx graph whose nodes are the unique user IDs; two
    users are linked if they have checked in at the same venue at least once, and
    the weight of the edge is the number of venues they share.
    """
    if dataset == "NYC":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_NYC.txt")
    elif dataset == "TKY":
        file = os.path.join("data", "foursquare", "dataset_TSMC2014_TKY.txt")
    else:
        raise ValueError("The dataset must be NYC or TKY")

    # the files are not valid utf-8 (venue names contain bytes such as 0xe9),
    # so they have to be read as latin-1
    df = pd.read_csv(file, sep="\t", header=None, encoding="latin-1", names=["UserID", "VenueID", "CategoryID", "CategoryName", "Latitude", "Longitude", "Timezone offset in minutes", "UTC time"])

    users = set(df["UserID"])  # the unique user IDs are the nodes
    G = nx.Graph()
    G.add_nodes_from(users)

    # dictionary with the user IDs as keys and the sets of visited venues as values
    users_venues = df.groupby("UserID")["VenueID"].apply(set).to_dict()

    # quadratic loop over the pairs of users, the price of this construction
    for user1, user2 in combinations(users, 2):
        shared_venues = users_venues[user1] & users_venues[user2]
        if shared_venues:
            G.add_edge(user1, user2, weight=len(shared_venues))
    return G
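

# A minimal usage sketch, not required by the notebook (which imports the
# functions above with `from main import *`): download the data, build the
# three graphs, and print their sizes. Assumes the default folder layout
# created by download_datasets().
if __name__ == "__main__":
    download_datasets()
    for name, G in [("brightkite", create_graph("brightkite")),
                    ("gowalla", create_graph("gowalla")),
                    ("foursquare NYC", create_foursquare_graph("NYC"))]:
        print("{} graph: {} nodes, {} edges".format(name, G.number_of_nodes(), G.number_of_edges()))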

Binary file not shown.

@@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"\n",
"import os\n",
"import zipfile\n",
"import wget\n",
"import networkx as nx\n",
"from main import *\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Discovering the datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To perform our analysis, we will use the following datasets:\n",
"\n",
"- **Brightkite**\n",
"- **Gowalla**\n",
"- **Foursquare**\n",
"\n",
"We can download the datasets using the function `download_dataset` from the `utils` module. It will download the datasets in the `data` folder, organized in sub-folders in the following way:\n",
"\n",
"```\n",
"data/\n",
"├── brightkite\n",
"│   ├── loc-brightkite_edges.txt.gz\n",
"│   ├── loc-brightkite_totalCheckins.txt.gz\n",
"├── foursquare\n",
"│   ├── loc-gowalla_edges.txt.gz\n",
"│   ├── loc-gowalla_totalCheckins.txt.gz\n",
"└── gowalla\n",
" ├── dataset_ubicomp2013_checkins.txt\n",
" ├── dataset_ubicomp2013_tags.txt\n",
" └── dataset_ubicomp2013_tips.txt\n",
"```\n",
"\n",
"If any of the datasets is already downloaded, it will not be downloaded again. For futher details about the function below, please refer to the `utils` module."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The brightkite dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
"The gowalla dataset is already downloaded and extracted as .txt file, if you want to download again the .gz file with this function, delete the .txt files in the folder\n",
"Downloading foursquare dataset...\n",
"Download completed of foursquare dataset\n"
]
}
],
"source": [
"download_datasets()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's have a deeper look at them.\n",
"\n",
"## Brightkite\n",
"\n",
"[Brightkite](http://www.brightkite.com/) was once a location-based social networking service provider where users shared their locations by checking-in. The friendship network was collected using their public API. The network was originally directed but the authors of the dataset have constructed a network with undirected edges when there is a friendship in both ways. They also have also collected a total of `4491143` checking of these users over the period of Apr. 2008 - Oct. 2010.\n",
"\n",
"Here is an example of check-in information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Brightkite_df = pd.read_csv(\"data/brightkite/loc-brightkite_totalCheckins.txt.gz\", sep=\"\\t\", header=None, compression=\"gzip\", names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
"\n",
"Brightkite_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Gowalla\n",
"\n",
"Gowalla is a location-based social networking website where users share their locations by checking-in. The friendship network is undirected and was collected using their public API. The authors have collected a total of `6442890` check-ins of these users over the period of Feb. 2009 - Oct. 2010.\n",
"\n",
"Here is an example of check-in information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Gowalla_df = pd.read_csv(\"data/gowalla/loc-gowalla_totalCheckins.txt.gz\", sep=\"\\t\", header=None, compression=\"gzip\", names=[\"user\", \"check-in time\", \"latitude\", \"longitude\", \"location_id\"])\n",
"\n",
"Gowalla_df.head() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Foursquare\n",
"\n",
"DA RISCRIVERE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# remove from memory, they were created only for aesthetic purposes in the notebook\n",
"\n",
"del Brightkite_df\n",
"del Gowalla_df\n",
"del Foursquare_checks_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building the networks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are asked to construct the networks for the three datasets as un undirected grah $M = (V, E)$, where $V$ is the set of nodes and $E$ is the set of edges. The nodes represent the users and the edges indicates that two individuals visited the same location at least once.\n",
"\n",
"We can use the fucntion create_graph from the utils module to create the networks. It takes as input the path to an edge list file and returns a networkx graph object. For further details about the function below, please refer to the `utils` module."
]
},
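{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before building the real networks, here is a quick sanity check of the co-check-in rule used for Foursquare, on a toy example with made-up users and venues: two users get an edge whose weight is the number of venues they share."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toy example with made-up users and venues (not part of the real datasets)\n",
"toy_venues = {\"u1\": {\"v1\", \"v2\"}, \"u2\": {\"v2\", \"v3\"}, \"u3\": {\"v4\"}}\n",
"\n",
"toy_G = nx.Graph()\n",
"toy_G.add_nodes_from(toy_venues)\n",
"for u1 in toy_venues:\n",
"    for u2 in toy_venues:\n",
"        if u1 < u2 and toy_venues[u1] & toy_venues[u2]:\n",
"            toy_G.add_edge(u1, u2, weight=len(toy_venues[u1] & toy_venues[u2]))\n",
"\n",
"print(toy_G.edges(data=True))  # expected: [('u1', 'u2', {'weight': 1})]\n",
"\n",
"del toy_G, toy_venues"
]
},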
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._string_convert\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._string_box_utf8\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_154187/2796184490.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mBrightkite_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"brightkite\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mGowalla_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gowalla\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mFoursquare_G\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcreate_foursquare_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"NYC\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/github/small-worlds/main.py\u001b[0m in \u001b[0;36mcreate_foursquare_graph\u001b[0;34m(dataset)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"foursquare\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dataset_TSMC2014_TKY.txt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"\\t\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"UserID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"VenueID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"CategoryID\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"CategoryName\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Latitude\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Longitude\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Timezone offset in minutes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"UTC time\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;31m# use the set() data structure to get the unique users ID\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minspect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcurrentframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m )\n\u001b[0;32m--> 317\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 948\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 950\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 611\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 612\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1771\u001b[0m \u001b[0mcol_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1772\u001b[0;31m \u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;31m# type: ignore[attr-defined]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1773\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1774\u001b[0m )\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlow_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_low_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 244\u001b[0m \u001b[0;31m# destructive to chunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_concatenate_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_with_dtype\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._string_convert\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers._string_box_utf8\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data"
]
}
],
"source": [
"Brightkite_G = create_graph(\"brightkite\")\n",
"Gowalla_G = create_graph(\"gowalla\")\n",
"Foursquare_G = create_foursquare_graph(\"NYC\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can have a look at the number of nodes and edges in each network."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Brightkite graph has {} nodes and {} edges\".format(Brightkite_G.number_of_nodes(), Brightkite_G.number_of_edges()))\n",
"\n",
"print(\"Gowalla graph has {} nodes and {} edges\".format(Gowalla_G.number_of_nodes(), Gowalla_G.number_of_edges()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}