updated readme and cli tool

Branch: main
Author: Antonio De Lucreziis (4 weeks ago)
parent: f0e93e92a2
commit: 0d12554e93

@@ -1,24 +1,51 @@

# Progetto ASD 2023/2024

Creation and analysis of a Pangenome Graph.

## Goals

- [x] Load a Pangenome Graph from the GFA format
- [x] Classify the graph's edges as tree, back, forward or cross
- [x] Remove all back edges to turn the graph into a DAG
- [x] Restrict the graph to its largest connected component
- [x] Reconstruct the node sequences along the paths, following each node's traversal direction (forward or reverse)
- [x] Search for a k-mer pattern in these sequences using a rolling hash (a sketch of the idea follows after the Usage section)
- [~] Compute the occurrence frequencies of all k-mers present in the graph

## CLI Options

- `-i, --input <input>`: file to read
- `-c, --path_count <path_count>`: number of paths to visit when searching for the pattern (default: 1)
- `-p, --pattern <pattern>`: k-mer pattern to search (default: "ACGT")
- `-k, --kmer_size <kmer_size>`: k-mer length (default: 4)

## Usage

- To show the help message:

  ```
  cargo run -- --help
  ```

- For example, to try out the `chrX` dataset:

  ```bash
  GFA_URL='https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz'
  wget $GFA_URL -O dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
  gunzip dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
  cargo run --release -- -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3
  ```

  Other datasets are listed in [Note](#Note).
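The rolling-hash goal above deserves a concrete reference point. The project's own `compute_sequence_occurrences_rolling_hash` is not part of this diff, so the following is only a minimal Rabin-Karp style sketch of the idea; the function name, the base-4 alphabet encoding and the modulus are illustrative choices, not the repository's.

```rust
/// Find all occurrences of `pattern` in `text` with a rolling hash.
/// Every window hash is updated in O(1) as the window slides, so all
/// length-k windows of a length-n text are hashed in O(n) total.
fn rolling_hash_occurrences(text: &str, pattern: &str) -> Vec<usize> {
    const BASE: u64 = 4;
    const MOD: u64 = 1_000_000_007;

    fn encode(b: u8) -> u64 {
        match b {
            b'A' => 0,
            b'C' => 1,
            b'G' => 2,
            b'T' => 3,
            _ => panic!("not a DNA base"),
        }
    }

    let k = pattern.len();
    let text: Vec<u64> = text.bytes().map(encode).collect();
    let pat: Vec<u64> = pattern.bytes().map(encode).collect();
    if k == 0 || text.len() < k {
        return vec![];
    }

    // BASE^(k-1) mod MOD, the weight of the character leaving the window
    let high = (0..k - 1).fold(1u64, |acc, _| acc * BASE % MOD);
    let hash = |s: &[u64]| s.iter().fold(0u64, |acc, &c| (acc * BASE + c) % MOD);

    let pat_hash = hash(&pat[..]);
    let mut window_hash = hash(&text[..k]);

    let mut occurrences = Vec::new();
    for i in 0..=text.len() - k {
        if i > 0 {
            // slide the window: drop text[i - 1], append text[i + k - 1]
            window_hash = (window_hash + MOD - text[i - 1] * high % MOD) % MOD;
            window_hash = (window_hash * BASE + text[i + k - 1]) % MOD;
        }
        // confirm on hash match to rule out collisions
        if window_hash == pat_hash && text[i..i + k] == pat[..] {
            occurrences.push(i);
        }
    }
    occurrences
}
```

For instance, `rolling_hash_occurrences("ACGTACGT", "ACGT")` returns `[0, 4]`. The point of the rolling update is that scanning a sequence of length n for a k-mer costs O(n) instead of the O(n·k) of the naive comparison.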
## Example GFA
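The actual example file (`dataset/example.gfa`) is not reproduced in this hunk. Purely as an illustration of the record types the parser handles, a tab-separated GFA with segments (`S`), oriented links (`L`) and a path (`P`) looks roughly like this; the identifiers and sequences below are made up:

```
H	VN:Z:1.0
S	1	ACGT
S	2	GGCA
S	3	TTAG
L	1	+	2	+	0M
L	1	+	3	-	0M
P	p1	1+,2+	*
```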

@@ -21,20 +21,6 @@ use rolling_hash::RollingHasher;

The `show` subcommand scaffolding (`CliSubcommands`, `CommandShow`) is removed and its options move directly onto `CliTool`, so the tool is now invoked without the `show` keyword:

```diff
 #[derive(FromArgs, PartialEq, Debug)]
 /// CLI tool for the Algorithms and Data Structures 2024 project
 struct CliTool {
-    #[argh(subcommand)]
-    nested: CliSubcommands,
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-#[argh(subcommand)]
-enum CliSubcommands {
-    Show(CommandShow),
-}
-
-#[derive(FromArgs, PartialEq, Debug)]
-/// Parse and show the content of a file
-#[argh(subcommand, name = "show")]
-struct CommandShow {
     #[argh(option, short = 'i')]
     /// file to read
     input: String,
```
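The goals above and the `main` in the next hunk rely on classifying edges (`compute_edge_types`) and on dropping back edges to obtain a DAG (`graph.dag()`); neither implementation appears in this diff. Below is only a sketch of the standard DFS edge classification those names refer to, written against a plain adjacency map rather than the project's `AdjacencyGraph`; all identifiers are illustrative.

```rust
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum EdgeType {
    Tree,
    Back,
    Forward,
    Cross,
}

/// Classify every directed edge using DFS colors and discovery times:
/// Tree goes to an undiscovered node, Back to a node still on the DFS
/// stack (gray), Forward to an already finished descendant, Cross to any
/// other finished node. Removing the Back edges leaves a DAG.
fn classify_edges(adj: &HashMap<u32, Vec<u32>>) -> Vec<(u32, u32, EdgeType)> {
    #[derive(Clone, Copy, PartialEq)]
    enum Color {
        White,
        Gray,
        Black,
    }

    // recursive for brevity; a graph the size of chrX would want an iterative DFS
    fn dfs(
        u: u32,
        adj: &HashMap<u32, Vec<u32>>,
        color: &mut HashMap<u32, Color>,
        disc: &mut HashMap<u32, usize>,
        time: &mut usize,
        out: &mut Vec<(u32, u32, EdgeType)>,
    ) {
        color.insert(u, Color::Gray);
        disc.insert(u, *time);
        *time += 1;
        for &v in adj.get(&u).into_iter().flatten() {
            match color.get(&v).copied().unwrap_or(Color::White) {
                Color::White => {
                    out.push((u, v, EdgeType::Tree));
                    dfs(v, adj, color, disc, time, out);
                }
                Color::Gray => out.push((u, v, EdgeType::Back)),
                Color::Black => {
                    let kind = if disc[&u] < disc[&v] {
                        EdgeType::Forward
                    } else {
                        EdgeType::Cross
                    };
                    out.push((u, v, kind));
                }
            }
        }
        color.insert(u, Color::Black);
    }

    let (mut color, mut disc, mut time, mut out) = (HashMap::new(), HashMap::new(), 0, Vec::new());
    for &u in adj.keys() {
        if color.get(&u).copied().unwrap_or(Color::White) == Color::White {
            dfs(u, adj, &mut color, &mut disc, &mut time, &mut out);
        }
    }
    out
}
```

In the pangenome graph the vertices are `(segment id, Orientation)` pairs rather than integers, but the classification logic is the same.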
@@ -55,125 +41,120 @@ struct CommandShow

With the subcommand gone, `main` no longer matches on `opts.nested`; the former body of the `Show` arm now runs directly, one indentation level shallower. The new version of the hunk:

```rust
fn main() -> std::io::Result<()> {
    let opts = argh::from_env::<CliTool>();

    // validate opts.pattern is a valid DNA sequence
    if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
        eprintln!("Invalid pattern: {:?}", opts.pattern);
        process::exit(1);
    }

    println!("Estimating line count...");
    let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
        .lines()
        .progress_with(indicatif::ProgressBar::new_spinner())
        .count() as u64;

    let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;

    println!("Number of entries: {}", entries.len());

    let mut sequence_map = HashMap::new();
    let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();

    let mut invalid_nodes = vec![];

    for entry in entries {
        match entry {
            Entry::Segment { id, sequence } => {
                // validate sequence is a valid DNA sequence
                if sequence.chars().any(|c| !"ACGT".contains(c)) {
                    invalid_nodes.push(id.clone());
                    continue;
                }

                sequence_map.insert(id.clone(), sequence);
            }
            Entry::Link {
                from,
                from_orient,
                to,
                to_orient,
            } => {
                graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
            }
            _ => {}
        }
    }

    println!("Removing {} invalid nodes...", invalid_nodes.len());

    // remove invalid nodes
    for id in invalid_nodes.iter().progress() {
        graph.remove_node(&(id.clone(), Orientation::Forward));
        graph.remove_node(&(id.clone(), Orientation::Reverse));
    }
    println!();

    compute_graph_degrees(&graph);

    let dag = graph.dag();

    compute_edge_types(&dag);

    let ccs = compute_ccs(&dag);

    println!("Picking largest connected component...");
    // pick the largest connected component
    let largest_cc = ccs
        .iter()
        .max_by_key(|cc| cc.len())
        .expect("at least one connected component");

    let largest_cc_graph = dag.restricted(largest_cc);

    let degrees = compute_graph_degrees(&largest_cc_graph);
    compute_edge_types(&largest_cc_graph); // to double check this is a DAG

    println!("Searching for a start node...");
    let start_node = degrees
        .iter()
        .find(|(_, degree)| degree.in_degree == 0)
        .expect("no start node found")
        .0;

    println!("Start node: {:?}", start_node);
    println!("{:?}", degrees.get(start_node).unwrap());

    compute_orientation_histogram(&largest_cc_graph);

    println!("Visiting the graph, searching {} paths...", opts.path_count);

    let sequences = compute_sequences(
        &sequence_map,
        &largest_cc_graph,
        start_node,
        opts.path_count,
    );

    for (i, sequence) in sequences.iter().enumerate() {
        println!("Sequence #{} of length {}", i + 1, sequence.len());

        println!("Searching {} (naive)...", opts.pattern);
        println!(
            "Occurrences: {:?}\n",
            compute_sequence_occurrences_naive(sequence, &opts.pattern)
        );

        println!("Searching {} (rolling hash)...", opts.pattern);
        println!(
            "Occurrences: {:?}\n",
            compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
        );
    }

    compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);

    println!("Cleaning up...");
    process::exit(0);
}

fn compute_kmer_histogram_lb(
```
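`compute_kmer_histogram_lb` itself lies outside the hunk, and the k-mer frequency goal in the README is only marked as partially done. As a reference for what a per-sequence k-mer count boils down to, here is a minimal sketch; the function name is illustrative, and reading the `_lb` suffix as "lower bound" is an assumption.

```rust
use std::collections::HashMap;

/// Count every k-mer occurring in one sequence by sliding a window of
/// width k. Applied to the reconstructed path sequences of the graph and
/// merged, this only lower-bounds the true frequencies: k-mers lying on
/// paths that were never visited are missed.
fn kmer_histogram(sequence: &str, k: usize) -> HashMap<&str, usize> {
    let mut histogram = HashMap::new();
    if k == 0 || sequence.len() < k {
        return histogram;
    }
    for start in 0..=sequence.len() - k {
        *histogram.entry(&sequence[start..start + k]).or_insert(0) += 1;
    }
    histogram
}
```

For example, `kmer_histogram("ACGTACG", 3)` yields `{"ACG": 2, "CGT": 1, "GTA": 1, "TAC": 1}`.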
