final version to send

main
Luca Lombardo 2 years ago
parent 55911adcde
commit 3b4d0e3200


@@ -9,134 +9,47 @@ import random
import time
from utils import *

"""
This script computes the omega index for a given graph using parallelization. To see the implementation of the omega index, check the function parallel_omega below (also available in omega.py in the same folder).
"""

def parallel_omega(G: nx.Graph, k: float, nrand: int = 6, niter: int = 6, n_processes: int = None, seed: int = 42) -> float:
    """
    Compute the omega index of a graph in parallel. This is a much faster approach than the standard omega function: it parallelizes the generation of the equivalent random graphs and lattice networks.

    Parameters
    ----------
    `G`: nx.Graph
        The graph for which to compute the omega index.
    `k`: float
        The percentage of nodes to sample from the graph.
    `nrand`: int
        Number of random graphs generated to compute the average shortest path length (Lr) of the random references and the average clustering coefficient (Cl) of the lattice references. Default is 6.
    `niter`: int
        Approximate number of rewirings per edge to compute each equivalent random graph. Default is 6.
    `n_processes`: int
        Number of processes to use. Default is the number of cores of the machine.
    `seed`: int
        The seed used to generate the random graphs. Default is 42.

    Returns
    -------
    `omega`: float
    """
    if n_processes is None:
        n_processes = multiprocessing.cpu_count()
    if n_processes > nrand:  # no point in spawning more workers than tasks
        n_processes = nrand

    random.seed(seed)

    if not nx.is_connected(G):
        G = G.subgraph(max(nx.connected_components(G), key=len))
    if len(G) == 1:
        return 0
    if k > 0:
        G = random_sample(G, k)

    def worker(queue_seeds, queue_results):  # worker function to be run in parallel
        while True:
            try:
                seed = queue_seeds.get(False)
            except Empty:
                break
            random_graph = nx.random_reference(G, niter, seed=seed)
            lattice_graph = nx.lattice_reference(G, niter, seed=seed)
            random_shortest_path = nx.average_shortest_path_length(random_graph)
            lattice_clustering = nx.average_clustering(lattice_graph)
            queue_results.put((random_shortest_path, lattice_clustering))

    manager = multiprocessing.Manager()  # manager to share the queues between processes
    queue_seeds = manager.Queue()  # queue that feeds the seeds to the workers
    queue_results = manager.Queue()  # queue that collects the results

    processes = [multiprocessing.Process(target=worker, args=(queue_seeds, queue_results))
                 for _ in range(n_processes)]

    for i in range(nrand):  # put the tasks in the queue
        queue_seeds.put(i + seed)
    for process in processes:  # start the processes
        process.start()
    for process in processes:  # wait for the processes to finish
        process.join()

    # collect the results
    shortest_paths = []
    clustering_coeffs = []
    while not queue_results.empty():
        random_shortest_path, lattice_clustering = queue_results.get()
        shortest_paths.append(random_shortest_path)
        clustering_coeffs.append(lattice_clustering)

    L = nx.average_shortest_path_length(G)
    C = nx.average_clustering(G)

    omega = (np.mean(shortest_paths) / L) - (C / np.mean(clustering_coeffs))
    return omega
if __name__ == "__main__":

    ### PARSING ARGUMENTS ###
    parser = argparse.ArgumentParser()
    parser.add_argument("graph", help="Name of the graph to be used.", choices=['checkins-foursquare', 'checkins-gowalla', 'checkins-brightkite', 'friends-foursquare', 'friends-gowalla', 'friends-brightkite'])
    parser.add_argument("--k", help="Percentage of nodes to be sampled. Needs to be a float between 0 and 1. Default is 0.", default=0, type=float)
    parser.add_argument("--nrand", help="Number of random graphs. Needs to be an integer. Default is 12.", default=12, type=int)
    parser.add_argument("--niter", help="Approximate number of rewirings per edge to compute the equivalent random graph. Default is 12.", default=12, type=int)
    parser.add_argument("--processes", help="Number of processes to be used. Needs to be an integer. Default is the number of cores.", default=multiprocessing.cpu_count(), type=int)
    parser.add_argument("--seed", help="Seed for the random number generator. Needs to be an integer. Default is 42.", default=42, type=int)
    parser.add_help = True
    args = parser.parse_args()

    # check that the number of processes is valid
    if args.processes > multiprocessing.cpu_count():
        print("Number of processes is higher than available. Setting it to the default value: all available")
        args.processes = multiprocessing.cpu_count()
    elif args.processes < 1:
        raise ValueError("Number of processes needs to be at least 1")

    # the name of the dataset is the second part of the input string
    name = args.graph.split('-')[1]
    if 'checkins' in args.graph:
        G = create_graph_from_checkins(name)  # function from utils.py, check it out there
    elif 'friends' in args.graph:
        G = create_friendships_graph(name)  # function from utils.py, check it out there
    G.name = str(args.graph) + " Checkins Graph"

    print("\nThe full graph {} has {} nodes and {} edges".format(args.graph, len(G), G.number_of_edges()))
    print("Number of processes used: ", args.processes)

    start = time.time()
    # parallel version of the omega index (see parallel_omega above)
    omega = parallel_omega(G, k=args.k, nrand=args.nrand, niter=args.niter, n_processes=args.processes, seed=args.seed)
    end = time.time()

    print("\nOmega: ", omega)
    print("Number of random graphs: ", args.nrand)
    print("Time: ", round(end - start, 2), " seconds")

@@ -1,56 +1,49 @@
#! /usr/bin/python3

import warnings
import time
import random
import argparse
warnings.filterwarnings("ignore")
import networkx as nx
from utils import *

"""
Standard script to compute the omega index for a given graph. To see the implementation of the omega index, refer to the networkx documentation:

https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.smallworld.omega.html#networkx.algorithms.smallworld.omega

This file has been created to be used with the server. It takes as input the name of the graph and the percentage of nodes to be sampled; it then computes the omega index for the sampled graph and returns the result. Run

```
./omega_sampled_server.py -h
```

to see the list of available graphs and the other parameters that can be passed as input.
"""

parser = argparse.ArgumentParser()
parser.add_argument("graph", help="Name of the graph to be used.", choices=['checkins-foursquare', 'checkins-gowalla', 'checkins-brightkite', 'friends-foursquare', 'friends-gowalla', 'friends-brightkite'])
parser.add_argument("--k", help="Percentage of nodes to be sampled. Needs to be a float between 0 and 1.", default=0)
parser.add_argument("--niter", help="Number of rewirings per edge. Needs to be an integer. Default is 5.", default=5)
parser.add_argument("--nrand", help="Number of random graphs. Needs to be an integer. Default is 5.", default=5)
parser.add_help = True
args = parser.parse_args()

# the name of the dataset is the second part of the input string
name = args.graph.split('-')[1]
if 'checkins' in args.graph:
    G = create_graph_from_checkins(name)
elif 'friends' in args.graph:
    G = create_friendships_graph(name)
G.name = str(args.graph) + " Checkins Graph"

# sample the graph
G_sample = random_sample(G, float(args.k))  # function from utils.py, check it out there

# compute omega
start = time.time()
print("\nComputing omega for graph: ", G.name)
omega = nx.omega(G_sample, niter=int(args.niter), nrand=int(args.nrand))
end = time.time()

print("\nOmega coefficient for graph {}: {}".format(G.name, omega))
print("Time taken: ", round(end - start, 2), " seconds")

@@ -23,7 +23,6 @@ from subprocess import run
from typing import Literal
from queue import Empty
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(SCRIPT_DIR, "data")
@@ -47,7 +46,6 @@ def download_datasets():
The datasets are downloaded in the "data" folder. If the folder doesn't exist, it will be created. If the dataset is already downloaded, it will be skipped. The files are renamed to make them more readable.
"""
dict = {
"brightkite": ["https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz", "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"],
"gowalla": ["https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz", "https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz"],
@@ -141,7 +139,7 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq
Raises
------
`ValueError`
If the dataset is not valid.
"""
@@ -149,7 +147,6 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq
if dataset not in ['brightkite', 'gowalla', 'foursquare']:
raise ValueError("Dataset not valid. Please choose between brightkite, gowalla, foursquare")
file = os.path.join(DATA_DIR, dataset, dataset + "_checkins.txt")
print("\nCreating the graph for the dataset {}...".format(dataset))
df = pd.read_csv(file, sep="\t", header=None, names=["user_id", "venue_id"], engine='pyarrow')
@@ -162,7 +159,7 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq
# path to the file where we want to save the graph
edges_path = os.path.join(DATA_DIR, dataset , dataset + "_checkins_edges.tsv")
print("Done! The graph has {} edges".format(G.number_of_edges()), " and {} nodes".format(G.number_of_nodes()))
print("Done! The graph has {} edges".format(G.number_of_edges()), "and {} nodes".format(G.number_of_nodes()))
# delete from memory the dataframe
del df
@@ -173,20 +170,21 @@ def create_graph_from_checkins(dataset: Literal['brightkite', 'gowalla', 'foursq
return G
# ------------------------------------------------------------------------#
def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursquareEU', 'foursquareIT'], create_file = True) -> nx.Graph:
"""
Create the graph of friendships for the dataset brightkite, gowalla or foursquare. The graph is saved in a file.
Parameters
----------
`dataset` : str
The dataset for which we want to create the graph of friendships.
`create_file` : bool, optional
If True, the graph is saved to a file. Default is True.
Returns
-------
`G` : networkx.Graph
@@ -213,10 +211,14 @@ def create_friendships_graph(dataset: Literal['brightkite', 'gowalla', 'foursqua
# get the intersection of the two sets and filter the friendship graph
unique_users = unique_friends.intersection(unique_checkins)
df = df_friends_all[df_friends_all["node1"].isin(unique_users) & df_friends_all["node2"].isin(unique_users)]
# save the graph in a file
if create_file:
df.to_csv(os.path.join(DATA_DIR, dataset, dataset + "_friends_edges_filtered.tsv"), sep="\t", header=False, index=False)
G = nx.from_pandas_edgelist(df, "node1", "node2", create_using=nx.Graph())
del df_friends_all, df_checkins, df # delete from memory the dataframes
print("Created the graph for the dataset {} with {} edges".format(dataset, G.number_of_edges()), "and {} nodes".format(G.number_of_nodes()))
return G
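A hedged usage sketch of this function (the dataset name is one of the choices listed in the argument parsers above, and the snippet assumes the datasets have already been downloaded into data/):

```
# build the friendship graph without writing the filtered edge list to disk
G = create_friendships_graph("brightkite", create_file=False)
print(G.number_of_nodes(), G.number_of_edges())
```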
@@ -327,8 +329,16 @@ def betweenness_centrality_parallel(G, processes=None, k=None) -> dict:
Returns
-------
`dict`
Dictionary of nodes with betweenness centrality as the value.
Raises
------
`ValueError`
If the number of processes is greater than the number of cores in the system
`ValueError`
If k is not None and k is not between 0 and 1
Notes
-----
Do not use more than 6 processes for big graphs, otherwise memory will fill up; do that only if you have at least 32 GB of RAM. For small graphs, you can use more processes.
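A usage sketch consistent with the signature above (the graph G and the process count are assumptions; keep the process count low on big graphs, as the note says):

```
bc = betweenness_centrality_parallel(G, processes=4, k=0.1)
top = sorted(bc.items(), key=lambda item: item[1], reverse=True)[:10]
print(top)  # the ten most central nodes
```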
@@ -357,10 +367,11 @@ def betweenness_centrality_parallel(G, processes=None, k=None) -> dict:
if k is None:
G_copy = G.copy()
p = Pool(processes=processes) # create a pool of processes
node_divisor = len(p._pool) * 4 # used to size the chunks: each chunk holds G.order() // node_divisor nodes
node_chunks = list(chunks(G_copy.nodes(), G_copy.order() // node_divisor)) # divide the nodes into chunks
num_chunks = len(node_chunks)
# run the algorithm on each chunk
bt_sc = p.starmap(
nx.betweenness_centrality_subset,
zip(
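The chunks helper used above is not shown in this diff; a minimal sketch consistent with how it is called (splitting the node sequence into tuples of a given size) could be:

```
import itertools

def chunks(l, n):
    """Split the iterable l into tuples of (at most) n items."""
    l_iter = iter(l)
    while True:
        chunk = tuple(itertools.islice(l_iter, n))
        if not chunk:
            return
        yield chunk
```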
@@ -396,12 +407,12 @@ def average_shortest_path(G: nx.Graph, k=None) -> float:
Returns
-------
`float`
The average shortest path length of the graph.
Raises
------
`ValueError`
If `k` is not between 0 and 1
"""
@@ -412,12 +423,11 @@ def average_shortest_path(G: nx.Graph, k=None) -> float:
connected_components = list(nx.connected_components(G))
else:
G_copy = G.copy()
# remove k% of the nodes from G
G_copy.remove_nodes_from(random.sample(list(G_copy.nodes()), int(k * G_copy.number_of_nodes())))
print("\tNumber of nodes after removing {}% of nodes: {}".format(k * 100, G_copy.number_of_nodes()))
print("\tNumber of edges after removing {}% of nodes: {}".format(k * 100, G_copy.number_of_edges()))
tmp = 0 # temporary variable to store the sum of the average shortest path lengths of the connected components
connected_components = list(nx.connected_components(G_copy))
# remove all the connected components with less than 10 nodes
connected_components = [c for c in connected_components if len(c) > 10]
@@ -445,12 +455,12 @@ def average_clustering_coefficient(G: nx.Graph, k=None) -> float:
Returns
-------
`float`
The average clustering coefficient of the graph.
Raises
------
`ValueError`
If `k` is not between 0 and 1
"""
@@ -461,9 +471,7 @@ def average_clustering_coefficient(G: nx.Graph, k=None) -> float:
return nx.average_clustering(G)
else:
G_copy = random_sample(G, k)
return nx.average_clustering(G_copy)
@@ -479,7 +487,7 @@ def generalized_average_clustering_coefficient(G: nx.Graph) -> float:
Returns
-------
`float`
The generalized average clustering coefficient of the graph.
"""
@@ -509,6 +517,10 @@ def create_random_graphs(G: nx.Graph, model = None, save = True) -> nx.Graph:
-------
`G_random` : nx.Graph
Notes
-----
This is just a time-saving and approximate way to create random graphs. If you want more accurate random graphs, you should use the function `random_reference` from the `networkx` library.
"""
if model is None:
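For comparison, the more accurate alternative mentioned in the Notes is networkx's own rewiring routines; a minimal sketch, assuming G is a connected nx.Graph:

```
import networkx as nx

# degree-preserving randomization: ~niter rewiring attempts per edge
G_random = nx.random_reference(G, niter=5, seed=42)
# lattice counterpart, used by the omega index for the clustering baseline
G_lattice = nx.lattice_reference(G, niter=5, seed=42)
```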
@@ -595,7 +607,7 @@ def visualize_graphs(G: nx.Graph, k: float, connected = True):
# create a pyvis network for the visualization
net = Network(directed=False, bgcolor='#1e1f29', font_color='white')
# for some reason, % values do not display the graph correctly, so I use pixels (sorry, non-FHD users)
net.width = '1920px'
@@ -652,14 +664,27 @@ def random_sample(graph: nx.Graph, k: float) -> nx.Graph:
`k`: float
The percentage of nodes to remove from the graph
Raises
------
`ValueError`
If k is not between 0 and 1
Returns
-------
`G`: nx.Graph
The sampled graph
"""
# edge cases
if not 0 <= k <= 1:
raise ValueError("Percentage of nodes needs to be between 0 and 1")
elif k == 0:
print("k is 0. Returning the original graph")
return graph
elif k == 1:
print("k is 1. Returning an empty graph")
return nx.Graph()
nodes = list(graph.nodes())
nodes_sample = np.random.choice(nodes, size=int((1-k)*len(nodes)), replace=False)
@@ -667,7 +692,7 @@ def random_sample(graph: nx.Graph, k: float) -> nx.Graph:
G = graph.subgraph(nodes_sample)
if not nx.is_connected(G):
print("\nGraph is not connected. Taking the largest connected component")
connected = max(nx.connected_components(G), key=len)
G_connected = graph.subgraph(connected)
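A quick usage sketch of random_sample (the Erdős–Rényi graph here is only for illustration):

```
import networkx as nx

G = nx.erdos_renyi_graph(1000, 0.01, seed=42)
G_small = random_sample(G, 0.3)  # remove 30% of the nodes, keep the giant component
print(G_small.number_of_nodes())
```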
@@ -681,7 +706,7 @@ def random_sample(graph: nx.Graph, k: float) -> nx.Graph:
def omega_sampled(G: nx.Graph, k: float, niter: int, nrand: int) -> float:
"""
Function to compute the omega index on a sampled graph
Parameters
----------
@@ -701,7 +726,6 @@ def omega_sampled(G: nx.Graph, k: float, niter: int, nrand: int) -> float:
-------
`omega`: float
The omega index of the graph
"""
# sample the graph
@@ -739,23 +763,30 @@ def parallel_omega(G: nx.Graph, k: float, nrand: int = 6, niter: int = 6, n_proc
`seed`: int
The seed to use to generate the random graphs. Default is 42.
Raises
------
`ValueError`
If n_processes is less than 1
Returns
-------
`omega`: float
Notes
-----
This is just a notebook version of the program omega_parallel_server.py that you can find in the repository; it is meant just for testing on small graphs. It is an experimental function that has not been fully tested.
"""
if n_processes is None:
n_processes = multiprocessing.cpu_count()
if n_processes > nrand:
n_processes = nrand
if n_processes < 1:
raise ValueError("Number of processes needs to be at least 1")
random.seed(seed)
if not nx.is_connected(G):
# take the largest connected component
G = G.subgraph(max(nx.connected_components(G), key=len))
if len(G) == 1:
@ -764,6 +795,7 @@ def parallel_omega(G: nx.Graph, k: float, nrand: int = 6, niter: int = 6, n_proc
# sample the graph
G = random_sample(G, k)
# we are using two queues to share the seeds and the results between the processes
def worker(queue_seeds, queue_results): # worker function to be used in parallel
while True:
try:
