updated readme and cli tool

main
Antonio De Lucreziis 2 months ago
parent f0e93e92a2
commit 0d12554e93

@ -1,24 +1,51 @@
# Progetto ASD 2023/2024
## Usage
Creazione e analisi di un Pangenome Graph
## Obbiettivi
- [x] Caricare un Pangenome Graph dal formato GFA
- [x] Classificare i nodi del grafo in base al tipo tree, back, forward, cross
- [x] Rimuovere tutti i nodi di tipo back per rendere il grafo un DAG
- [x] Restringere il grafo alla componente connessa più grande
- `cargo run -- --help`
- [x] Ricostruire le sequenze dei nodi del grafo in base ai cammini ed alla direzione di percorrenza dei nodi (forward o reverse)
Display help message.
- [x] Ricerca di un pattern k-mer in queste sequenze utilizzando il rolling hash
- `cargo run -- show -i ./dataset/example.gfa`
- [~] Calcolare le frequenze di occorrenza di tutti i k-mer presenti nel grafo
Parses the GFA file and prints the graph and various information.
## CLI Options
- `-i, --input <input>`: file to read
- `-c, --path_count <path_count>`: number of paths to visit when searching for the pattern (default: 1)
- `-p, --pattern <pattern>`: k-mer pattern to search (default: "ACGT")
- `-k, --kmer_size <kmer_size>`: k-mer length (default: 4)
## Usage
- Visualize the graph using EGUI, go to `examples/configurable` and run:
- To show help message:
`cargo run -- ../../dataset/example.gfa`
```
cargo run -- --help`
```
`cargo run -- ../../dataset/DRB1-3123_unsorted.gfa`
- For example to try out the `chrX` dataset:
- Running the project with all the flags:
```bash
GFA_URL='https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz'
wget $GFA_URL -O dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
gunzip dataset/chrX.hprc-v1.0-pggb.local.gfa.gz
cargo run --release -- -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3
```
`cargo run --release -- show -i dataset/chrX.hprc-v1.0-pggb.local.gfa -c 2 -p ACGT -k 3`
altri dataset sono elencati in [Note](#Note)
## Example GFA

@ -21,20 +21,6 @@ use rolling_hash::RollingHasher;
#[derive(FromArgs, PartialEq, Debug)]
/// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024
struct CliTool {
#[argh(subcommand)]
nested: CliSubcommands,
}
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand)]
enum CliSubcommands {
Show(CommandShow),
}
#[derive(FromArgs, PartialEq, Debug)]
/// Parse and show the content of a file
#[argh(subcommand, name = "show")]
struct CommandShow {
#[argh(option, short = 'i')]
/// file to read
input: String,
@ -55,125 +41,120 @@ struct CommandShow {
fn main() -> std::io::Result<()> {
let opts = argh::from_env::<CliTool>();
match opts.nested {
CliSubcommands::Show(opts) => {
// validate opts.pattern is a valid DNA sequence
if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
eprintln!("Invalid pattern: {:?}", opts.pattern);
process::exit(1);
}
// validate opts.pattern is a valid DNA sequence
if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
eprintln!("Invalid pattern: {:?}", opts.pattern);
process::exit(1);
}
println!("Estimating line count...");
let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
.lines()
.progress_with(indicatif::ProgressBar::new_spinner())
.count() as u64;
let entries =
gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
println!("Number of entries: {}", entries.len());
let mut sequence_map = HashMap::new();
let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
let mut invalid_nodes = vec![];
for entry in entries {
match entry {
Entry::Segment { id, sequence } => {
// validate sequence is a valid DNA sequence
if sequence.chars().any(|c| !"ACGT".contains(c)) {
invalid_nodes.push(id.clone());
continue;
}
sequence_map.insert(id.clone(), sequence);
}
Entry::Link {
from,
from_orient,
to,
to_orient,
} => {
graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
}
_ => {}
println!("Estimating line count...");
let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
.lines()
.progress_with(indicatif::ProgressBar::new_spinner())
.count() as u64;
let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
println!("Number of entries: {}", entries.len());
let mut sequence_map = HashMap::new();
let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
let mut invalid_nodes = vec![];
for entry in entries {
match entry {
Entry::Segment { id, sequence } => {
// validate sequence is a valid DNA sequence
if sequence.chars().any(|c| !"ACGT".contains(c)) {
invalid_nodes.push(id.clone());
continue;
}
}
println!("Removing {} invalid nodes...", invalid_nodes.len());
// remove invalid nodes
for id in invalid_nodes.iter().progress() {
graph.remove_node(&(id.clone(), Orientation::Forward));
graph.remove_node(&(id.clone(), Orientation::Reverse));
sequence_map.insert(id.clone(), sequence);
}
println!();
compute_graph_degrees(&graph);
Entry::Link {
from,
from_orient,
to,
to_orient,
} => {
graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
}
_ => {}
}
}
let dag = graph.dag();
println!("Removing {} invalid nodes...", invalid_nodes.len());
// remove invalid nodes
for id in invalid_nodes.iter().progress() {
graph.remove_node(&(id.clone(), Orientation::Forward));
graph.remove_node(&(id.clone(), Orientation::Reverse));
}
println!();
compute_edge_types(&dag);
compute_graph_degrees(&graph);
let ccs = compute_ccs(&dag);
let dag = graph.dag();
println!("Picking largest connected component...");
// pick the largest connected component
let largest_cc = ccs
.iter()
.max_by_key(|cc| cc.len())
.expect("at least one connected components");
compute_edge_types(&dag);
let largest_cc_graph = dag.restricted(largest_cc);
let ccs = compute_ccs(&dag);
let degrees = compute_graph_degrees(&largest_cc_graph);
compute_edge_types(&largest_cc_graph); // to double check this is a DAG
println!("Picking largest connected component...");
// pick the largest connected component
let largest_cc = ccs
.iter()
.max_by_key(|cc| cc.len())
.expect("at least one connected components");
println!("Searching for a start node...");
let start_node = degrees
.iter()
.find(|(_, degree)| degree.in_degree == 0)
.expect("no start node found")
.0;
let largest_cc_graph = dag.restricted(largest_cc);
println!("Start node: {:?}", start_node);
println!("{:?}", degrees.get(start_node).unwrap());
let degrees = compute_graph_degrees(&largest_cc_graph);
compute_edge_types(&largest_cc_graph); // to double check this is a DAG
compute_orientation_histogram(&largest_cc_graph);
println!("Searching for a start node...");
let start_node = degrees
.iter()
.find(|(_, degree)| degree.in_degree == 0)
.expect("no start node found")
.0;
println!("Visiting the graph, searching {} paths...", opts.path_count);
println!("Start node: {:?}", start_node);
println!("{:?}", degrees.get(start_node).unwrap());
let sequences = compute_sequences(
&sequence_map,
&largest_cc_graph,
start_node,
opts.path_count,
);
compute_orientation_histogram(&largest_cc_graph);
for (i, sequence) in sequences.iter().enumerate() {
println!("Sequence #{} of length {}", i + 1, sequence.len());
println!("Visiting the graph, searching {} paths...", opts.path_count);
println!("Searching {} (naive)...", opts.pattern);
println!(
"Occurrences: {:?}\n",
compute_sequence_occurrences_naive(sequence, &opts.pattern)
);
let sequences = compute_sequences(
&sequence_map,
&largest_cc_graph,
start_node,
opts.path_count,
);
println!("Searching {} (rolling hash)...", opts.pattern);
println!(
"Occurrences: {:?}\n",
compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
);
}
for (i, sequence) in sequences.iter().enumerate() {
println!("Sequence #{} of length {}", i + 1, sequence.len());
compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);
println!("Searching {} (naive)...", opts.pattern);
println!(
"Occurrences: {:?}\n",
compute_sequence_occurrences_naive(sequence, &opts.pattern)
);
println!("Cleaning up...");
process::exit(0);
}
println!("Searching {} (rolling hash)...", opts.pattern);
println!(
"Occurrences: {:?}\n",
compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern)
);
}
compute_kmer_histogram_lb(&sequence_map, &largest_cc_graph, opts.kmer_size);
println!("Cleaning up...");
process::exit(0);
}
fn compute_kmer_histogram_lb(

Loading…
Cancel
Save