From 0d12554e93a9f242d64a6f5e803c695faf449fc9 Mon Sep 17 00:00:00 2001
From: Antonio De Lucreziis <antonio.delucreziis@gmail.com>
Date: Wed, 30 Oct 2024 12:54:26 +0100
Subject: [PATCH] updated readme and cli tool

---
 README.md   |  47 +++++++++---
 src/main.rs | 205 ++++++++++++++++++++++++----------------------------
 2 files changed, 130 insertions(+), 122 deletions(-)
diff --git a/README.md b/README.md
index 1759f72..7574e21 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,51 @@
 # Progetto ASD 2023/2024
 
-## Usage
+Creazione e analisi di un Pangenome Graph
+
+## Obbiettivi
+
+- [x] Caricare un Pangenome Graph dal formato GFA
+
+- [x] Classificare i nodi del grafo in base al tipo tree, back, forward, cross
+
+- [x] Rimuovere tutti i nodi di tipo back per rendere il grafo un DAG
+
+- [x] Restringere il grafo alla componente connessa più grande
 
--   `cargo run -- --help`
+- [x] Ricostruire le sequenze dei nodi del grafo in base ai cammini ed alla direzione di percorrenza dei nodi (forward o reverse)
 
-    Display help message.
+- [x] Ricerca di un pattern k-mer in queste sequenze utilizzando il rolling hash
 
--   `cargo run -- show -i ./dataset/example.gfa`
+- [~] Calcolare le frequenze di occorrenza di tutti i k-mer presenti nel grafo
 
-    Parses the GFA file and prints the graph and various information.
+## CLI Options
+
+- `-i, --input <input>`: file to read
+
+- `-c, --path_count <path_count>`: number of paths to visit when searching for the pattern (default: 1)
+
+- `-p, --pattern <pattern>`: k-mer pattern to search (default: "ACGT")
+
+- `-k, --kmer_size <kmer_size>`: k-mer length (default: 4)
+
+## Usage
 
--   Visualize the graph using EGUI, go to `examples/configurable` and run:
+-   To show help message:
 
-    `cargo run -- ../../dataset/example.gfa`
+    ```
+    cargo run -- --help`
+    ```
 
-    `cargo run -- ../../dataset/DRB1-3123_unsorted.gfa`
+-   For example to try out the `chrX` dataset:
 
--   Running the project with all the flags:
+    ```bash
+    GFA_URL='https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz'
+    wget $GFA_URL -O dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
+    gunzip dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
+    cargo run --release -- -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3
+    ```
 
-    `cargo run --release -- show -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3`
+    altri dataset sono elencati in [Note](#Note)
 
 ## Example GFA
 
diff --git a/src/main.rs b/src/main.rs
index 460f880..3588f4b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,20 +21,6 @@ use rolling_hash::RollingHasher;
 #[derive(FromArgs, PartialEq, Debug)]
 /// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024
 struct CliTool {
-    #[argh(subcommand)]
-    nested: CliSubcommands,
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-#[argh(subcommand)]
-enum CliSubcommands {
-    Show(CommandShow),
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-/// Parse and show the content of a file
-#[argh(subcommand, name = "show")]
-struct CommandShow {
     #[argh(option, short = 'i')]
     /// file to read
     input: String,
@@ -55,125 +41,120 @@ struct CommandShow {
 fn main() -> std::io::Result<()> {
     let opts = argh::from_env::<CliTool>();
 
-    match opts.nested {
-        CliSubcommands::Show(opts) => {
-            // validate opts.pattern is a valid DNA sequence
-            if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
-                eprintln!("Invalid pattern: {:?}", opts.pattern);
-                process::exit(1);
-            }
+    // validate opts.pattern is a valid DNA sequence
+    if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
+        eprintln!("Invalid pattern: {:?}", opts.pattern);
+        process::exit(1);
+    }
 
-            println!("Estimating line count...");
-
-            let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
-                .lines()
-                .progress_with(indicatif::ProgressBar::new_spinner())
-                .count() as u64;
-
-            let entries =
-                gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
-
-            println!("Number of entries: {}", entries.len());
-
-            let mut sequence_map = HashMap::new();
-            let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
-
-            let mut invalid_nodes = vec![];
-
-            for entry in entries {
-                match entry {
-                    Entry::Segment { id, sequence } => {
-                        // validate sequence is a valid DNA sequence
-                        if sequence.chars().any(|c| !"ACGT".contains(c)) {
-                            invalid_nodes.push(id.clone());
-                            continue;
-                        }
-
-                        sequence_map.insert(id.clone(), sequence);
-                    }
-                    Entry::Link {
-                        from,
-                        from_orient,
-                        to,
-                        to_orient,
-                    } => {
-                        graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
-                    }
-                    _ => {}
+    println!("Estimating line count...");
+
+    let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
+        .lines()
+        .progress_with(indicatif::ProgressBar::new_spinner())
+        .count() as u64;
+
+    let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
+
+    println!("Number of entries: {}", entries.len());
+
+    let mut sequence_map = HashMap::new();
+    let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
+
+    let mut invalid_nodes = vec![];
+
+    for entry in entries {
+        match entry {
+            Entry::Segment { id, sequence } => {
+                // validate sequence is a valid DNA sequence
+                if sequence.chars().any(|c| !"ACGT".contains(c)) {
+                    invalid_nodes.push(id.clone());
+                    continue;
                 }
-            }
 
-            println!("Removing {} invalid nodes...", invalid_nodes.len());
-            // remove invalid nodes
-            for id in invalid_nodes.iter().progress() {
-                graph.remove_node(&(id.clone(), Orientation::Forward));
-                graph.remove_node(&(id.clone(), Orientation::Reverse));
+                sequence_map.insert(id.clone(), sequence);
             }
-            println!();
-
-            compute_graph_degrees(&graph);
+            Entry::Link {
+                from,
+                from_orient,
+                to,
+                to_orient,
+            } => {
+                graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
+            }
+            _ => {}
+        }
+    }
 
-            let dag = graph.dag();
+    println!("Removing {} invalid nodes...", invalid_nodes.len());
+    // remove invalid nodes
+    for id in invalid_nodes.iter().progress() {
+        graph.remove_node(&(id.clone(), Orientation::Forward));
+        graph.remove_node(&(id.clone(), Orientation::Reverse));
+    }
+    println!();
 
-            compute_edge_types(&dag);
+    compute_graph_degrees(&graph);
 
-            let ccs = compute_ccs(&dag);
+    let dag = graph.dag();
 
-            println!("Picking largest connected component...");
-            // pick the largest connected component
-            let largest_cc = ccs
-                .iter()
-                .max_by_key(|cc| cc.len())
-                .expect("at least one connected components");
+    compute_edge_types(&dag);
 
-            let largest_cc_graph = dag.restricted(largest_cc);
+    let ccs = compute_ccs(&dag);
 
-            let degrees = compute_graph_degrees(&largest_cc_graph);
-            compute_edge_types(&largest_cc_graph); // to double check this is a DAG
+    println!("Picking largest connected component...");
+    // pick the largest connected component
+    let largest_cc = ccs
+        .iter()
+        .max_by_key(|cc| cc.len())
+        .expect("at least one connected components");
 
-            println!("Searching for a start node...");
-            let start_node = degrees
-                .iter()
-                .find(|(_, degree)| degree.in_degree == 0)
-                .expect("no start node found")
-                .0;
+    let largest_cc_graph = dag.restricted(largest_cc);
 
-            println!("Start node: {:?}", start_node);
-            println!("{:?}", degrees.get(start_node).unwrap());
+    let degrees = compute_graph_degrees(&largest_cc_graph);
+    compute_edge_types(&largest_cc_graph); // to double check this is a DAG
 
-            compute_orientation_histogram(&largest_cc_graph);
+    println!("Searching for a start node...");
+    let start_node = degrees
+        .iter()
+        .find(|(_, degree)| degree.in_degree == 0)
+        .expect("no start node found")
+        .0;
 
-            println!("Visiting the graph, searching {} paths...", opts.path_count);
+    println!("Start node: {:?}", start_node);
+    println!("{:?}", degrees.get(start_node).unwrap());
 
-            let sequences = compute_sequences(
-                &sequence_map,
-                &largest_cc_graph,
-                start_node,
-                opts.path_count,
-            );
+    compute_orientation_histogram(&largest_cc_graph);
 
-            for (i, sequence) in sequences.iter().enumerate() {
-                println!("Sequence #{} of length {}", i + 1, sequence.len());
+    println!("Visiting the graph, searching {} paths...", opts.path_count);
 
-                println!("Searching {} (naive)...", opts.pattern);
-                println!(
-                    "Occurrences: {:?}\n",
-                    compute_sequence_occurrences_naive(sequence, &opts.pattern)
-                );
+    let sequences = compute_sequences(
+        &sequence_map,
+        &largest_cc_graph,
+        start_node,
+        opts.path_count,
+    );
 
-                println!("Searching {} (rolling hash)...", opts.pattern);
-                println!(
-                    "Occurrences: {:?}\n",
-                    compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
-                );
-            }
+    for (i, sequence) in sequences.iter().enumerate() {
+        println!("Sequence #{} of length {}", i + 1, sequence.len());
 
-            compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);
+        println!("Searching {} (naive)...", opts.pattern);
+        println!(
+            "Occurrences: {:?}\n",
+            compute_sequence_occurrences_naive(sequence, &opts.pattern)
+        );
 
-            println!("Cleaning up...");
-            process::exit(0);
-        }
+        println!("Searching {} (rolling hash)...", opts.pattern);
+        println!(
+            "Occurrences: {:?}\n",
+            compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
+        );
     }
+
+    compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);
+
+    println!("Cleaning up...");
+    process::exit(0);
 }
 
 fn compute_kmer_histogram_lb(