updated readme and cli tool

Branch: main
Author: Antonio De Lucreziis (4 weeks ago)
parent: f0e93e92a2
commit: 0d12554e93

@@ -1,24 +1,51 @@

# Progetto ASD 2023/2024

Creation and analysis of a Pangenome Graph.

## Goals

- [x] Load a Pangenome Graph from the GFA format
- [x] Classify the graph's edges as tree, back, forward or cross
- [x] Remove all back edges to turn the graph into a DAG
- [x] Restrict the graph to its largest connected component
- [x] Reconstruct the node sequences along the paths, following each node's traversal direction (forward or reverse)
- [x] Search for a k-mer pattern in these sequences using a rolling hash (a sketch of the idea follows after the Usage section)
- [~] Compute the occurrence frequencies of all k-mers present in the graph

## CLI Options

- `-i, --input <input>`: file to read
- `-c, --path_count <path_count>`: number of paths to visit when searching for the pattern (default: 1)
- `-p, --pattern <pattern>`: k-mer pattern to search (default: "ACGT")
- `-k, --kmer_size <kmer_size>`: k-mer length (default: 4)

## Usage

- To show the help message:

  ```
  cargo run -- --help
  ```

- For example, to try out the `chrX` dataset:

  ```bash
  GFA_URL='https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz'
  wget $GFA_URL -O dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
  gunzip dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
  cargo run --release -- -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3
  ```

  Other datasets are listed in [Note](#Note).
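The rolling-hash goal above deserves a concrete reference point. The project's own `compute_sequence_occurrences_rolling_hash` is not part of this diff, so the following is only a minimal Rabin-Karp style sketch of the idea; the function name, the base-4 alphabet encoding and the modulus are illustrative choices, not the repository's.

```rust
/// Find all occurrences of `pattern` in `text` with a rolling hash.
/// Every window hash is updated in O(1) as the window slides, so all
/// length-k windows of a length-n text are hashed in O(n) total.
fn rolling_hash_occurrences(text: &str, pattern: &str) -> Vec<usize> {
    const BASE: u64 = 4;
    const MOD: u64 = 1_000_000_007;

    fn encode(b: u8) -> u64 {
        match b {
            b'A' => 0,
            b'C' => 1,
            b'G' => 2,
            b'T' => 3,
            _ => panic!("not a DNA base"),
        }
    }

    let k = pattern.len();
    let text: Vec<u64> = text.bytes().map(encode).collect();
    let pat: Vec<u64> = pattern.bytes().map(encode).collect();
    if k == 0 || text.len() < k {
        return vec![];
    }

    // BASE^(k-1) mod MOD, the weight of the character leaving the window
    let high = (0..k - 1).fold(1u64, |acc, _| acc * BASE % MOD);
    let hash = |s: &[u64]| s.iter().fold(0u64, |acc, &c| (acc * BASE + c) % MOD);

    let pat_hash = hash(&pat[..]);
    let mut window_hash = hash(&text[..k]);

    let mut occurrences = Vec::new();
    for i in 0..=text.len() - k {
        if i > 0 {
            // slide the window: drop text[i - 1], append text[i + k - 1]
            window_hash = (window_hash + MOD - text[i - 1] * high % MOD) % MOD;
            window_hash = (window_hash * BASE + text[i + k - 1]) % MOD;
        }
        // confirm on hash match to rule out collisions
        if window_hash == pat_hash && text[i..i + k] == pat[..] {
            occurrences.push(i);
        }
    }
    occurrences
}
```

For instance, `rolling_hash_occurrences("ACGTACGT", "ACGT")` returns `[0, 4]`. The point of the rolling update is that scanning a sequence of length n for a k-mer costs O(n) instead of the O(n·k) of the naive comparison.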
## Example GFA
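The actual example file (`dataset/example.gfa`) is not reproduced in this hunk. Purely as an illustration of the record types the parser handles, a tab-separated GFA with segments (`S`), oriented links (`L`) and a path (`P`) looks roughly like this; the identifiers and sequences below are made up:

```
H	VN:Z:1.0
S	1	ACGT
S	2	GGCA
S	3	TTAG
L	1	+	2	+	0M
L	1	+	3	-	0M
P	p1	1+,2+	*
```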

@@ -21,20 +21,6 @@ use rolling_hash::RollingHasher;

The `show` subcommand scaffolding (`CliSubcommands`, `CommandShow`) is removed and its options move directly onto `CliTool`, so the tool is now invoked without the `show` keyword:

```diff
 #[derive(FromArgs, PartialEq, Debug)]
 /// CLI tool for the Algorithms and Data Structures 2024 project
 struct CliTool {
-    #[argh(subcommand)]
-    nested: CliSubcommands,
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-#[argh(subcommand)]
-enum CliSubcommands {
-    Show(CommandShow),
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-/// Parse and show the content of a file
-#[argh(subcommand, name = "show")]
-struct CommandShow {
     #[argh(option, short = 'i')]
     /// file to read
     input: String,
```
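The goals above and the `main` in the next hunk rely on classifying edges (`compute_edge_types`) and on dropping back edges to obtain a DAG (`graph.dag()`); neither implementation appears in this diff. Below is only a sketch of the standard DFS edge classification those names refer to, written against a plain adjacency map rather than the project's `AdjacencyGraph`; all identifiers are illustrative.

```rust
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum EdgeType {
    Tree,
    Back,
    Forward,
    Cross,
}

/// Classify every directed edge using DFS colors and discovery times:
/// Tree goes to an undiscovered node, Back to a node still on the DFS
/// stack (gray), Forward to an already finished descendant, Cross to any
/// other finished node. Removing the Back edges leaves a DAG.
fn classify_edges(adj: &HashMap<u32, Vec<u32>>) -> Vec<(u32, u32, EdgeType)> {
    #[derive(Clone, Copy, PartialEq)]
    enum Color {
        White,
        Gray,
        Black,
    }

    // recursive for brevity; a graph the size of chrX would want an iterative DFS
    fn dfs(
        u: u32,
        adj: &HashMap<u32, Vec<u32>>,
        color: &mut HashMap<u32, Color>,
        disc: &mut HashMap<u32, usize>,
        time: &mut usize,
        out: &mut Vec<(u32, u32, EdgeType)>,
    ) {
        color.insert(u, Color::Gray);
        disc.insert(u, *time);
        *time += 1;
        for &v in adj.get(&u).into_iter().flatten() {
            match color.get(&v).copied().unwrap_or(Color::White) {
                Color::White => {
                    out.push((u, v, EdgeType::Tree));
                    dfs(v, adj, color, disc, time, out);
                }
                Color::Gray => out.push((u, v, EdgeType::Back)),
                Color::Black => {
                    let kind = if disc[&u] < disc[&v] {
                        EdgeType::Forward
                    } else {
                        EdgeType::Cross
                    };
                    out.push((u, v, kind));
                }
            }
        }
        color.insert(u, Color::Black);
    }

    let (mut color, mut disc, mut time, mut out) = (HashMap::new(), HashMap::new(), 0, Vec::new());
    for &u in adj.keys() {
        if color.get(&u).copied().unwrap_or(Color::White) == Color::White {
            dfs(u, adj, &mut color, &mut disc, &mut time, &mut out);
        }
    }
    out
}
```

In the pangenome graph the vertices are `(segment id, Orientation)` pairs rather than integers, but the classification logic is the same.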
@@ -55,125 +41,120 @@ struct CommandShow

With the subcommand gone, `main` no longer matches on `opts.nested`; the former body of the `Show` arm now runs directly, one indentation level shallower. The new version of the hunk:

```rust
fn main() -> std::io::Result<()> {
    let opts = argh::from_env::<CliTool>();

    // validate opts.pattern is a valid DNA sequence
    if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
        eprintln!("Invalid pattern: {:?}", opts.pattern);
        process::exit(1);
    }

    println!("Estimating line count...");
    let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
        .lines()
        .progress_with(indicatif::ProgressBar::new_spinner())
        .count() as u64;

    let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;

    println!("Number of entries: {}", entries.len());

    let mut sequence_map = HashMap::new();
    let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();

    let mut invalid_nodes = vec![];

    for entry in entries {
        match entry {
            Entry::Segment { id, sequence } => {
                // validate sequence is a valid DNA sequence
                if sequence.chars().any(|c| !"ACGT".contains(c)) {
                    invalid_nodes.push(id.clone());
                    continue;
                }

                sequence_map.insert(id.clone(), sequence);
            }
            Entry::Link {
                from,
                from_orient,
                to,
                to_orient,
            } => {
                graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
            }
            _ => {}
        }
    }

    println!("Removing {} invalid nodes...", invalid_nodes.len());

    // remove invalid nodes
    for id in invalid_nodes.iter().progress() {
        graph.remove_node(&(id.clone(), Orientation::Forward));
        graph.remove_node(&(id.clone(), Orientation::Reverse));
    }
    println!();

    compute_graph_degrees(&graph);

    let dag = graph.dag();

    compute_edge_types(&dag);

    let ccs = compute_ccs(&dag);

    println!("Picking largest connected component...");
    // pick the largest connected component
    let largest_cc = ccs
        .iter()
        .max_by_key(|cc| cc.len())
        .expect("at least one connected component");

    let largest_cc_graph = dag.restricted(largest_cc);

    let degrees = compute_graph_degrees(&largest_cc_graph);
    compute_edge_types(&largest_cc_graph); // to double check this is a DAG

    println!("Searching for a start node...");
    let start_node = degrees
        .iter()
        .find(|(_, degree)| degree.in_degree == 0)
        .expect("no start node found")
        .0;

    println!("Start node: {:?}", start_node);
    println!("{:?}", degrees.get(start_node).unwrap());

    compute_orientation_histogram(&largest_cc_graph);

    println!("Visiting the graph, searching {} paths...", opts.path_count);

    let sequences = compute_sequences(
        &sequence_map,
        &largest_cc_graph,
        start_node,
        opts.path_count,
    );

    for (i, sequence) in sequences.iter().enumerate() {
        println!("Sequence #{} of length {}", i + 1, sequence.len());

        println!("Searching {} (naive)...", opts.pattern);
        println!(
            "Occurrences: {:?}\n",
            compute_sequence_occurrences_naive(sequence, &opts.pattern)
        );

        println!("Searching {} (rolling hash)...", opts.pattern);
        println!(
            "Occurrences: {:?}\n",
            compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
        );
    }

    compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);

    println!("Cleaning up...");
    process::exit(0);
}

fn compute_kmer_histogram_lb(
```
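`compute_kmer_histogram_lb` itself lies outside the hunk, and the k-mer frequency goal in the README is only marked as partially done. As a reference for what a per-sequence k-mer count boils down to, here is a minimal sketch; the function name is illustrative, and reading the `_lb` suffix as "lower bound" is an assumption.

```rust
use std::collections::HashMap;

/// Count every k-mer occurring in one sequence by sliding a window of
/// width k. Applied to the reconstructed path sequences of the graph and
/// merged, this only lower-bounds the true frequencies: k-mers lying on
/// paths that were never visited are missed.
fn kmer_histogram(sequence: &str, k: usize) -> HashMap<&str, usize> {
    let mut histogram = HashMap::new();
    if k == 0 || sequence.len() < k {
        return histogram;
    }
    for start in 0..=sequence.len() - k {
        *histogram.entry(&sequence[start..start + k]).or_insert(0) += 1;
    }
    histogram
}
```

For example, `kmer_histogram("ACGTACG", 3)` yields `{"ACG": 2, "CGT": 1, "GTA": 1, "TAC": 1}`.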
