|
|
@ -5,7 +5,7 @@ mod graph;
|
|
|
|
mod rolling_hash;
|
|
|
|
mod rolling_hash;
|
|
|
|
|
|
|
|
|
|
|
|
use std::{
|
|
|
|
use std::{
|
|
|
|
collections::{BTreeMap, HashMap},
|
|
|
|
collections::{BTreeMap, BTreeSet, HashMap},
|
|
|
|
fmt::Debug,
|
|
|
|
fmt::Debug,
|
|
|
|
io::{BufRead, BufReader},
|
|
|
|
io::{BufRead, BufReader},
|
|
|
|
process,
|
|
|
|
process,
|
|
|
@ -55,42 +55,104 @@ fn main() -> std::io::Result<()> {
|
|
|
|
.count() as u64;
|
|
|
|
.count() as u64;
|
|
|
|
|
|
|
|
|
|
|
|
let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
|
|
|
|
let entries = gfa::parser::parse_source(std::fs::File::open(opts.input)?, file_lines_count)?;
|
|
|
|
|
|
|
|
|
|
|
|
println!("Number of entries: {}", entries.len());
|
|
|
|
println!("Number of entries: {}", entries.len());
|
|
|
|
|
|
|
|
println!();
|
|
|
|
|
|
|
|
|
|
|
|
let mut sequence_map = HashMap::new();
|
|
|
|
let mut sequence_map = HashMap::new();
|
|
|
|
let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
|
|
|
|
let mut graph: AdjacencyGraph<(String, Orientation)> = AdjacencyGraph::new();
|
|
|
|
|
|
|
|
|
|
|
|
let mut invalid_nodes = vec![];
|
|
|
|
let mut invalid_nodes = BTreeSet::new();
|
|
|
|
|
|
|
|
|
|
|
|
for entry in entries {
|
|
|
|
println!("Populating nodes...");
|
|
|
|
|
|
|
|
for entry in &entries {
|
|
|
|
match entry {
|
|
|
|
match entry {
|
|
|
|
Entry::Segment { id, sequence } => {
|
|
|
|
Entry::Segment { id, sequence } => {
|
|
|
|
// validate sequence is a valid DNA sequence
|
|
|
|
// validate sequence is a valid DNA sequence
|
|
|
|
if sequence.chars().any(|c| !"ACGT".contains(c)) {
|
|
|
|
if sequence.chars().any(|c| !"ACGT".contains(c)) {
|
|
|
|
invalid_nodes.push(id.clone());
|
|
|
|
invalid_nodes.insert(id.clone());
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sequence_map.insert(id.clone(), sequence);
|
|
|
|
sequence_map.insert(id.clone(), sequence.clone());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
_ => {}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
println!("Number of nodes: {}", sequence_map.len());
|
|
|
|
|
|
|
|
println!();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
println!("Populating edges...");
|
|
|
|
|
|
|
|
for entry in &entries {
|
|
|
|
|
|
|
|
match entry {
|
|
|
|
Entry::Link {
|
|
|
|
Entry::Link {
|
|
|
|
from,
|
|
|
|
from,
|
|
|
|
from_orient,
|
|
|
|
from_orient,
|
|
|
|
to,
|
|
|
|
to,
|
|
|
|
to_orient,
|
|
|
|
to_orient,
|
|
|
|
} => {
|
|
|
|
} => {
|
|
|
|
graph.add_edge((from.clone(), from_orient), (to.clone(), to_orient));
|
|
|
|
if invalid_nodes.contains(from) || invalid_nodes.contains(to) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
graph.add_edge(
|
|
|
|
|
|
|
|
(from.clone(), from_orient.clone()),
|
|
|
|
|
|
|
|
(to.clone(), to_orient.clone()),
|
|
|
|
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => {}
|
|
|
|
_ => {}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
println!("Number of edges: {}", graph.edges().len());
|
|
|
|
|
|
|
|
println!();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
println!("Sequences stats:");
|
|
|
|
|
|
|
|
let mut sequence_lengths: Vec<_> = sequence_map.values().map(|seq| seq.len()).collect();
|
|
|
|
|
|
|
|
sequence_lengths.sort();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let sequence_lengths_average =
|
|
|
|
|
|
|
|
sequence_lengths.iter().sum::<usize>() as f64 / sequence_lengths.len() as f64;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
println!("- Average length: {:.2}", sequence_lengths_average);
|
|
|
|
|
|
|
|
println!(
|
|
|
|
|
|
|
|
"- Standard deviation: {:.2}",
|
|
|
|
|
|
|
|
(sequence_lengths
|
|
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
|
|
.map(|len| (*len as f64 - sequence_lengths_average).powi(2))
|
|
|
|
|
|
|
|
.sum::<f64>()
|
|
|
|
|
|
|
|
/ sequence_lengths.len() as f64)
|
|
|
|
|
|
|
|
.sqrt()
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
println!(
|
|
|
|
|
|
|
|
"- Max length: {}",
|
|
|
|
|
|
|
|
sequence_lengths
|
|
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
|
|
.max()
|
|
|
|
|
|
|
|
.expect("at least one sequence")
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
println!(
|
|
|
|
|
|
|
|
"- Min length: {}",
|
|
|
|
|
|
|
|
sequence_lengths
|
|
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
|
|
.min()
|
|
|
|
|
|
|
|
.expect("at least one sequence")
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
println!(
|
|
|
|
|
|
|
|
"- Median length: {}",
|
|
|
|
|
|
|
|
sequence_lengths[sequence_lengths.len() / 2]
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
println!();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let sequence_lengths_histogram: BTreeMap<usize, usize> =
|
|
|
|
|
|
|
|
sequence_lengths
|
|
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
|
|
.fold(BTreeMap::new(), |mut acc, len| {
|
|
|
|
|
|
|
|
*acc.entry(*len).or_insert(0) += 1;
|
|
|
|
|
|
|
|
acc
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
println!("Removing {} invalid nodes...", invalid_nodes.len());
|
|
|
|
println!("Sequence lengths histogram (length/count):");
|
|
|
|
// remove invalid nodes
|
|
|
|
for (length, count) in sequence_lengths_histogram.iter() {
|
|
|
|
for id in invalid_nodes.iter().progress() {
|
|
|
|
println!("- {}: {}", length, count);
|
|
|
|
graph.remove_node(&(id.clone(), Orientation::Forward));
|
|
|
|
|
|
|
|
graph.remove_node(&(id.clone(), Orientation::Reverse));
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
println!();
|
|
|
|
println!();
|
|
|
|
|
|
|
|
|
|
|
|