diff --git a/README.md b/README.md
index 1759f72..7574e21 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,51 @@
# Progetto ASD 2023/2024
-## Usage
+Creazione e analisi di un Pangenome Graph
+
+## Obbiettivi
+
+- [x] Caricare un Pangenome Graph dal formato GFA
+
+- [x] Classificare i nodi del grafo in base al tipo tree, back, forward, cross
+
+- [x] Rimuovere tutti i nodi di tipo back per rendere il grafo un DAG
+
+- [x] Restringere il grafo alla componente connessa piĆ¹ grande
-- `cargo run -- --help`
+- [x] Ricostruire le sequenze dei nodi del grafo in base ai cammini ed alla direzione di percorrenza dei nodi (forward o reverse)
- Display help message.
+- [x] Ricerca di un pattern k-mer in queste sequenze utilizzando il rolling hash
-- `cargo run -- show -i ./dataset/example.gfa`
+- [~] Calcolare le frequenze di occorrenza di tutti i k-mer presenti nel grafo
- Parses the GFA file and prints the graph and various information.
+## CLI Options
+
+- `-i, --input `: file to read
+
+- `-c, --path_count `: number of paths to visit when searching for the pattern (default: 1)
+
+- `-p, --pattern `: k-mer pattern to search (default: "ACGT")
+
+- `-k, --kmer_size `: k-mer length (default: 4)
+
+## Usage
-- Visualize the graph using EGUI, go to `examples/configurable` and run:
+- To show help message:
- `cargo run -- ../../dataset/example.gfa`
+ ```
+ cargo run -- --help`
+ ```
- `cargo run -- ../../dataset/DRB1-3123_unsorted.gfa`
+- For example to try out the `chrX` dataset:
-- Running the project with all the flags:
+ ```bash
+ GFA_URL='https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz'
+ wget $GFA_URL -O dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
+ gunzip dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
+ cargo run --release -- -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3
+ ```
- `cargo run --release -- show -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3`
+ altri dataset sono elencati in [Note](#Note)
## Example GFA
diff --git a/src/main.rs b/src/main.rs
index 460f880..3588f4b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,20 +21,6 @@ use rolling_hash::RollingHasher;
#[derive(FromArgs, PartialEq, Debug)]
/// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024
struct CliTool {
- #[argh(subcommand)]
- nested: CliSubcommands,
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-#[argh(subcommand)]
-enum CliSubcommands {
- Show(CommandShow),
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-/// Parse and show the content of a file
-#[argh(subcommand, name = "show")]
-struct CommandShow {
#[argh(option, short = 'i')]
/// file to read
input: String,
@@ -55,125 +41,120 @@ struct CommandShow {
fn main() -> std::io::Result<()> {
let opts = argh::from_env::();
- match opts.nested {
- CliSubcommands::Show(opts) => {
- // validate opts.pattern is a valid DNA sequence
- if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
- eprintln!("Invalid pattern: {:?}", opts.pattern);
- process::exit(1);
- }
+ // validate opts.pattern is a valid DNA sequence
+ if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
+ eprintln!("Invalid pattern: {:?}", opts.pattern);
+ process::exit(1);
+ }
- println!("Estimating line count...");
-
- let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
- .lines()
- .progress_with(indicatif::ProgressBar::new_spinner())
- .count() as u64;
-
- let entries =
- gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
-
- println!("Number of entries: {}", entries.len());
-
- let mut sequence_map = HashMap::new();
- let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
-
- let mut invalid_nodes = vec![];
-
- for entry in entries {
- match entry {
- Entry::Segment { id, sequence } => {
- // validate sequence is a valid DNA sequence
- if sequence.chars().any(|c| !"ACGT".contains(c)) {
- invalid_nodes.push(id.clone());
- continue;
- }
-
- sequence_map.insert(id.clone(), sequence);
- }
- Entry::Link {
- from,
- from_orient,
- to,
- to_orient,
- } => {
- graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
- }
- _ => {}
+ println!("Estimating line count...");
+
+ let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
+ .lines()
+ .progress_with(indicatif::ProgressBar::new_spinner())
+ .count() as u64;
+
+ let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
+
+ println!("Number of entries: {}", entries.len());
+
+ let mut sequence_map = HashMap::new();
+ let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
+
+ let mut invalid_nodes = vec![];
+
+ for entry in entries {
+ match entry {
+ Entry::Segment { id, sequence } => {
+ // validate sequence is a valid DNA sequence
+ if sequence.chars().any(|c| !"ACGT".contains(c)) {
+ invalid_nodes.push(id.clone());
+ continue;
}
- }
- println!("Removing {} invalid nodes...", invalid_nodes.len());
- // remove invalid nodes
- for id in invalid_nodes.iter().progress() {
- graph.remove_node(&(id.clone(), Orientation::Forward));
- graph.remove_node(&(id.clone(), Orientation::Reverse));
+ sequence_map.insert(id.clone(), sequence);
}
- println!();
-
- compute_graph_degrees(&graph);
+ Entry::Link {
+ from,
+ from_orient,
+ to,
+ to_orient,
+ } => {
+ graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
+ }
+ _ => {}
+ }
+ }
- let dag = graph.dag();
+ println!("Removing {} invalid nodes...", invalid_nodes.len());
+ // remove invalid nodes
+ for id in invalid_nodes.iter().progress() {
+ graph.remove_node(&(id.clone(), Orientation::Forward));
+ graph.remove_node(&(id.clone(), Orientation::Reverse));
+ }
+ println!();
- compute_edge_types(&dag);
+ compute_graph_degrees(&graph);
- let ccs = compute_ccs(&dag);
+ let dag = graph.dag();
- println!("Picking largest connected component...");
- // pick the largest connected component
- let largest_cc = ccs
- .iter()
- .max_by_key(|cc| cc.len())
- .expect("at least one connected components");
+ compute_edge_types(&dag);
- let largest_cc_graph = dag.restricted(largest_cc);
+ let ccs = compute_ccs(&dag);
- let degrees = compute_graph_degrees(&largest_cc_graph);
- compute_edge_types(&largest_cc_graph); // to double check this is a DAG
+ println!("Picking largest connected component...");
+ // pick the largest connected component
+ let largest_cc = ccs
+ .iter()
+ .max_by_key(|cc| cc.len())
+ .expect("at least one connected components");
- println!("Searching for a start node...");
- let start_node = degrees
- .iter()
- .find(|(_, degree)| degree.in_degree == 0)
- .expect("no start node found")
- .0;
+ let largest_cc_graph = dag.restricted(largest_cc);
- println!("Start node: {:?}", start_node);
- println!("{:?}", degrees.get(start_node).unwrap());
+ let degrees = compute_graph_degrees(&largest_cc_graph);
+ compute_edge_types(&largest_cc_graph); // to double check this is a DAG
- compute_orientation_histogram(&largest_cc_graph);
+ println!("Searching for a start node...");
+ let start_node = degrees
+ .iter()
+ .find(|(_, degree)| degree.in_degree == 0)
+ .expect("no start node found")
+ .0;
- println!("Visiting the graph, searching {} paths...", opts.path_count);
+ println!("Start node: {:?}", start_node);
+ println!("{:?}", degrees.get(start_node).unwrap());
- let sequences = compute_sequences(
- &sequence_map,
- &largest_cc_graph,
- start_node,
- opts.path_count,
- );
+ compute_orientation_histogram(&largest_cc_graph);
- for (i, sequence) in sequences.iter().enumerate() {
- println!("Sequence #{} of length {}", i + 1, sequence.len());
+ println!("Visiting the graph, searching {} paths...", opts.path_count);
- println!("Searching {} (naive)...", opts.pattern);
- println!(
- "Occurrences: {:?}\n",
- compute_sequence_occurrences_naive(sequence, &opts.pattern)
- );
+ let sequences = compute_sequences(
+ &sequence_map,
+ &largest_cc_graph,
+ start_node,
+ opts.path_count,
+ );
- println!("Searching {} (rolling hash)...", opts.pattern);
- println!(
- "Occurrences: {:?}\n",
- compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
- );
- }
+ for (i, sequence) in sequences.iter().enumerate() {
+ println!("Sequence #{} of length {}", i + 1, sequence.len());
- compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);
+ println!("Searching {} (naive)...", opts.pattern);
+ println!(
+ "Occurrences: {:?}\n",
+ compute_sequence_occurrences_naive(sequence, &opts.pattern)
+ );
- println!("Cleaning up...");
- process::exit(0);
- }
+ println!("Searching {} (rolling hash)...", opts.pattern);
+ println!(
+ "Occurrences: {:?}\n",
+ compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
+ );
}
+
+ compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);
+
+ println!("Cleaning up...");
+ process::exit(0);
}
fn compute_kmer_histogram_lb(