From 0d12554e93a9f242d64a6f5e803c695faf449fc9 Mon Sep 17 00:00:00 2001 From: Antonio De Lucreziis Date: Wed, 30 Oct 2024 12:54:26 +0100 Subject: [PATCH] updated readme and cli tool --- README.md | 47 +++++++++--- src/main.rs | 205 ++++++++++++++++++++++++---------------------------- 2 files changed, 130 insertions(+), 122 deletions(-) diff --git a/README.md b/README.md index 1759f72..7574e21 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,51 @@ # Progetto ASD 2023/2024 -## Usage +Creazione e analisi di un Pangenome Graph + +## Obbiettivi + +- [x] Caricare un Pangenome Graph dal formato GFA + +- [x] Classificare i nodi del grafo in base al tipo tree, back, forward, cross + +- [x] Rimuovere tutti i nodi di tipo back per rendere il grafo un DAG + +- [x] Restringere il grafo alla componente connessa piĆ¹ grande -- `cargo run -- --help` +- [x] Ricostruire le sequenze dei nodi del grafo in base ai cammini ed alla direzione di percorrenza dei nodi (forward o reverse) - Display help message. +- [x] Ricerca di un pattern k-mer in queste sequenze utilizzando il rolling hash -- `cargo run -- show -i ./dataset/example.gfa` +- [~] Calcolare le frequenze di occorrenza di tutti i k-mer presenti nel grafo - Parses the GFA file and prints the graph and various information. +## CLI Options + +- `-i, --input `: file to read + +- `-c, --path_count `: number of paths to visit when searching for the pattern (default: 1) + +- `-p, --pattern `: k-mer pattern to search (default: "ACGT") + +- `-k, --kmer_size `: k-mer length (default: 4) + +## Usage -- Visualize the graph using EGUI, go to `examples/configurable` and run: +- To show help message: - `cargo run -- ../../dataset/example.gfa` + ``` + cargo run -- --help` + ``` - `cargo run -- ../../dataset/DRB1-3123_unsorted.gfa` +- For example to try out the `chrX` dataset: -- Running the project with all the flags: + ```bash + GFA_URL='https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz' + wget $GFA_URL -O dataset/chrX.hprc-v1.0-pggb.local.gfa.gz + gunzip dataset/chrX.hprc-v1.0-pggb.local.gfa.gz + cargo run --release -- -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3 + ``` - `cargo run --release -- show -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3` + altri dataset sono elencati in [Note](#Note) ## Example GFA diff --git a/src/main.rs b/src/main.rs index 460f880..3588f4b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,20 +21,6 @@ use rolling_hash::RollingHasher; #[derive(FromArgs, PartialEq, Debug)] /// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024 struct CliTool { - #[argh(subcommand)] - nested: CliSubcommands, -} - -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand)] -enum CliSubcommands { - Show(CommandShow), -} - -#[derive(FromArgs, PartialEq, Debug)] -/// Parse and show the content of a file -#[argh(subcommand, name = "show")] -struct CommandShow { #[argh(option, short = 'i')] /// file to read input: String, @@ -55,125 +41,120 @@ struct CommandShow { fn main() -> std::io::Result<()> { let opts = argh::from_env::(); - match opts.nested { - CliSubcommands::Show(opts) => { - // validate opts.pattern is a valid DNA sequence - if opts.pattern.chars().any(|c| !"ACGT".contains(c)) { - eprintln!("Invalid pattern: {:?}", opts.pattern); - process::exit(1); - } + // validate opts.pattern is a valid DNA sequence + if opts.pattern.chars().any(|c| !"ACGT".contains(c)) { + eprintln!("Invalid pattern: {:?}", opts.pattern); + process::exit(1); + } - println!("Estimating line count..."); - - let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?) - .lines() - .progress_with(indicatif::ProgressBar::new_spinner()) - .count() as u64; - - let entries = - gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?; - - println!("Number of entries: {}", entries.len()); - - let mut sequence_map = HashMap::new(); - let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new(); - - let mut invalid_nodes = vec![]; - - for entry in entries { - match entry { - Entry::Segment { id, sequence } => { - // validate sequence is a valid DNA sequence - if sequence.chars().any(|c| !"ACGT".contains(c)) { - invalid_nodes.push(id.clone()); - continue; - } - - sequence_map.insert(id.clone(), sequence); - } - Entry::Link { - from, - from_orient, - to, - to_orient, - } => { - graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient)); - } - _ => {} + println!("Estimating line count..."); + + let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?) + .lines() + .progress_with(indicatif::ProgressBar::new_spinner()) + .count() as u64; + + let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?; + + println!("Number of entries: {}", entries.len()); + + let mut sequence_map = HashMap::new(); + let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new(); + + let mut invalid_nodes = vec![]; + + for entry in entries { + match entry { + Entry::Segment { id, sequence } => { + // validate sequence is a valid DNA sequence + if sequence.chars().any(|c| !"ACGT".contains(c)) { + invalid_nodes.push(id.clone()); + continue; } - } - println!("Removing {} invalid nodes...", invalid_nodes.len()); - // remove invalid nodes - for id in invalid_nodes.iter().progress() { - graph.remove_node(&(id.clone(), Orientation::Forward)); - graph.remove_node(&(id.clone(), Orientation::Reverse)); + sequence_map.insert(id.clone(), sequence); } - println!(); - - compute_graph_degrees(&graph); + Entry::Link { + from, + from_orient, + to, + to_orient, + } => { + graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient)); + } + _ => {} + } + } - let dag = graph.dag(); + println!("Removing {} invalid nodes...", invalid_nodes.len()); + // remove invalid nodes + for id in invalid_nodes.iter().progress() { + graph.remove_node(&(id.clone(), Orientation::Forward)); + graph.remove_node(&(id.clone(), Orientation::Reverse)); + } + println!(); - compute_edge_types(&dag); + compute_graph_degrees(&graph); - let ccs = compute_ccs(&dag); + let dag = graph.dag(); - println!("Picking largest connected component..."); - // pick the largest connected component - let largest_cc = ccs - .iter() - .max_by_key(|cc| cc.len()) - .expect("at least one connected components"); + compute_edge_types(&dag); - let largest_cc_graph = dag.restricted(largest_cc); + let ccs = compute_ccs(&dag); - let degrees = compute_graph_degrees(&largest_cc_graph); - compute_edge_types(&largest_cc_graph); // to double check this is a DAG + println!("Picking largest connected component..."); + // pick the largest connected component + let largest_cc = ccs + .iter() + .max_by_key(|cc| cc.len()) + .expect("at least one connected components"); - println!("Searching for a start node..."); - let start_node = degrees - .iter() - .find(|(_, degree)| degree.in_degree == 0) - .expect("no start node found") - .0; + let largest_cc_graph = dag.restricted(largest_cc); - println!("Start node: {:?}", start_node); - println!("{:?}", degrees.get(start_node).unwrap()); + let degrees = compute_graph_degrees(&largest_cc_graph); + compute_edge_types(&largest_cc_graph); // to double check this is a DAG - compute_orientation_histogram(&largest_cc_graph); + println!("Searching for a start node..."); + let start_node = degrees + .iter() + .find(|(_, degree)| degree.in_degree == 0) + .expect("no start node found") + .0; - println!("Visiting the graph, searching {} paths...", opts.path_count); + println!("Start node: {:?}", start_node); + println!("{:?}", degrees.get(start_node).unwrap()); - let sequences = compute_sequences( - &sequence_map, - &largest_cc_graph, - start_node, - opts.path_count, - ); + compute_orientation_histogram(&largest_cc_graph); - for (i, sequence) in sequences.iter().enumerate() { - println!("Sequence #{} of length {}", i + 1, sequence.len()); + println!("Visiting the graph, searching {} paths...", opts.path_count); - println!("Searching {} (naive)...", opts.pattern); - println!( - "Occurrences: {:?}\n", - compute_sequence_occurrences_naive(sequence, &opts.pattern) - ); + let sequences = compute_sequences( + &sequence_map, + &largest_cc_graph, + start_node, + opts.path_count, + ); - println!("Searching {} (rolling hash)...", opts.pattern); - println!( - "Occurrences: {:?}\n", - compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern) - ); - } + for (i, sequence) in sequences.iter().enumerate() { + println!("Sequence #{} of length {}", i + 1, sequence.len()); - compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size); + println!("Searching {} (naive)...", opts.pattern); + println!( + "Occurrences: {:?}\n", + compute_sequence_occurrences_naive(sequence, &opts.pattern) + ); - println!("Cleaning up..."); - process::exit(0); - } + println!("Searching {} (rolling hash)...", opts.pattern); + println!( + "Occurrences: {:?}\n", + compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern) + ); } + + compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size); + + println!("Cleaning up..."); + process::exit(0); } fn compute_kmer_histogram_lb(