finito circa

main
Antonio De Lucreziis 3 weeks ago
parent fed8d30de2
commit 98e324786c

@ -9,110 +9,57 @@ use indicatif::{ProgressBar, ProgressIterator};
use super::{AdjacencyGraph, Graph, UndirectedGraph}; use super::{AdjacencyGraph, Graph, UndirectedGraph};
impl<V> Graph<V> for AdjacencyGraph<V>
where
V: Ord + Clone,
{
fn nodes(&self) -> BTreeSet<V> {
self.nodes.clone()
}
fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>> {
self.adjacencies.clone()
}
fn edges(&self) -> BTreeSet<(V, V)> {
self.adjacencies
.iter()
.flat_map(|(from, tos)| tos.iter().map(move |to| (from.clone(), to.clone())))
.collect()
}
// pub fn edges(&self) -> impl Iterator<Item = (&V, &V)> {
// self.adjacencies
// .iter()
// .flat_map(|(from, tos)| tos.iter().map(move |to| (from, to)))
// }
}
impl<V> Graph<V> for UndirectedGraph<V>
where
V: Ord + Clone,
{
fn nodes(&self) -> BTreeSet<V> {
self.directed.nodes()
}
fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>> {
self.directed.adjacencies()
}
fn edges(&self) -> BTreeSet<(V, V)> {
self.directed.edges()
}
}
#[allow(dead_code)] #[allow(dead_code)]
impl<V> AdjacencyGraph<V> impl<V> AdjacencyGraph<V>
where where
V: Ord + Eq + Clone + Debug, V: Ord + Eq + Clone + Debug,
{ {
pub fn new() -> Self { // pub fn new() -> Self {
AdjacencyGraph { // AdjacencyGraph {
nodes: BTreeSet::new(), // nodes: BTreeSet::new(),
adjacencies: BTreeMap::new(), // adjacencies: BTreeMap::new(),
} // }
} // }
pub fn from_edges(edges: &[(V, V)]) -> Self {
let mut graph = AdjacencyGraph::new();
for (from, to) in edges {
graph.add_edge(from.clone(), to.clone());
}
graph
}
pub fn add_node(&mut self, node: V) { // pub fn add_node(&mut self, node: V) {
self.nodes.insert(node); // self.nodes.insert(node);
} // }
pub fn add_edge(&mut self, from: V, to: V) { // pub fn add_edge(&mut self, from: V, to: V) {
self.add_node(from.clone()); // self.add_node(from.clone());
self.add_node(to.clone()); // self.add_node(to.clone());
self.adjacencies // self.adjacencies
.entry(from) // .entry(from)
.or_insert_with(BTreeSet::new) // .or_insert_with(BTreeSet::new)
.insert(to); // .insert(to);
} // }
pub fn remove_edge(&mut self, from: &V, to: &V) { // pub fn remove_edge(&mut self, from: &V, to: &V) {
if let Some(adjacencies) = self.adjacencies.get_mut(from) { // if let Some(adjacencies) = self.adjacencies.get_mut(from) {
adjacencies.remove(to); // adjacencies.remove(to);
} // }
} // }
pub fn get_adjacencies(&self, node: &V) -> Option<&BTreeSet<V>> { // pub fn get_adjacencies(&self, node: &V) -> Option<&BTreeSet<V>> {
self.adjacencies.get(node) // self.adjacencies.get(node)
} // }
pub fn adjacencies(&self) -> &BTreeMap<V, BTreeSet<V>> { // pub fn adjacencies(&self) -> &BTreeMap<V, BTreeSet<V>> {
&self.adjacencies // &self.adjacencies
} // }
pub fn nodes(&self) -> &BTreeSet<V> { // pub fn nodes(&self) -> &BTreeSet<V> {
&self.nodes // &self.nodes
} // }
pub fn edges(&self) -> impl Iterator<Item = (&V, &V)> { // pub fn edges(&self) -> impl Iterator<Item = (&V, &V)> {
self.adjacencies // self.adjacencies
.iter() // .iter()
.flat_map(|(from, tos)| tos.iter().map(move |to| (from, to))) // .flat_map(|(from, tos)| tos.iter().map(move |to| (from, to)))
} // }
pub fn opposite(&self) -> AdjacencyGraph<&V> { pub fn opposite(&self) -> AdjacencyGraph<V> {
let mut opposite = AdjacencyGraph::new(); let mut opposite = AdjacencyGraph::new();
// O(|E|) // O(|E|)
@ -142,79 +89,67 @@ where
let mut restricted = AdjacencyGraph::new(); let mut restricted = AdjacencyGraph::new();
for node in nodes { for node in nodes {
if let Some(adjacencies) = self.get_adjacencies(&node) { for adj in self.neighbors(&node) {
for adj in adjacencies { if nodes_index.contains(&adj) {
if nodes_index.contains(adj) {
restricted.add_edge(node.clone(), adj.clone()); restricted.add_edge(node.clone(), adj.clone());
} }
} }
} }
}
restricted restricted
} }
pub fn has_edge(&self, from: &V, to: &V) -> bool { pub fn has_edge(&self, from: &V, to: &V) -> bool {
// O(1) self.neighbors(from).contains(to)
if let Some(adjacencies) = self.get_adjacencies(from) {
// O(1)
adjacencies.contains(&to.to_owned())
} else {
false
}
} }
pub fn dfs<'a>(&'a self, node: &'a V) -> impl Iterator<Item = V> + 'a { // pub fn dfs<'a>(&'a self, node: &'a V) -> impl Iterator<Item = V> + 'a {
let mut visited = BTreeSet::new(); // let mut visited = BTreeSet::new();
let mut stack = VecDeque::from([node]); // let mut stack = VecDeque::from([node]);
std::iter::from_fn(move || { // std::iter::from_fn(move || {
while let Some(node) = stack.pop_back() { // while let Some(node) = stack.pop_back() {
if !visited.insert(node.clone()) { // if !visited.insert(node.clone()) {
continue; // continue;
} // }
if let Some(adjacencies) = self.get_adjacencies(node) { // stack.extend(self.neighbors(node).iter());
stack.extend(adjacencies);
}
return Some(node.clone()); // return Some(node.clone());
} // }
None // None
}) // })
} // }
pub fn shortest_path_matrix(&self) -> BTreeMap<&V, BTreeMap<&V, usize>> { pub fn shortest_path_matrix(&self) -> BTreeMap<V, BTreeMap<V, usize>> {
let mut result = BTreeMap::new(); let mut result = BTreeMap::new();
for node in self.nodes.iter() { for node in self.nodes.iter() {
let mut distances = BTreeMap::new(); let mut distances = BTreeMap::new();
let mut visited = BTreeSet::new(); let mut visited = BTreeSet::new();
let mut queue = VecDeque::from([node]); let mut queue = VecDeque::from([node.clone()]);
distances.insert(node, 0); distances.insert(node.clone(), 0);
while let Some(node) = queue.pop_front() { while let Some(node) = queue.pop_front() {
if visited.contains(node) { if visited.contains(&node) {
continue; continue;
} }
visited.insert(node.clone()); visited.insert(node.clone());
let distance = *distances.get(node).unwrap(); let distance = *distances.get(&node).unwrap();
if let Some(adjacencies) = self.get_adjacencies(node) { for adj in self.neighbors(&node) {
for adj in adjacencies { if !distances.contains_key(&adj) {
if !distances.contains_key(adj) { distances.insert(adj.clone(), distance + 1);
distances.insert(adj, distance + 1); queue.push_back(adj.clone());
queue.push_back(adj);
}
} }
} }
} }
result.insert(node, distances); result.insert(node.clone(), distances);
} }
result result
@ -242,27 +177,23 @@ where
} }
let mut cc: BTreeSet<V> = BTreeSet::new(); let mut cc: BTreeSet<V> = BTreeSet::new();
let mut stack: Vec<&V> = vec![node]; let mut stack: Vec<V> = vec![node.clone()];
while let Some(node) = stack.pop() { while let Some(node) = stack.pop() {
if cc.contains(node) { if cc.contains(&node) {
continue; continue;
} }
cc.insert(node.clone()); cc.insert(node.clone());
if let Some(adjacencies) = self.get_adjacencies(&node) { for adj in self.neighbors(&node) {
for adj in adjacencies {
stack.push(adj); stack.push(adj);
} }
}
if let Some(adjacencies) = op.get_adjacencies(&node) { for adj in op.neighbors(&node) {
for adj in adjacencies {
stack.push(adj); stack.push(adj);
} }
} }
}
visited.extend(cc.iter().map(|x| x.to_owned())); visited.extend(cc.iter().map(|x| x.to_owned()));
result.push(cc.iter().map(|x| x.to_owned()).collect()); result.push(cc.iter().map(|x| x.to_owned()).collect());
@ -275,12 +206,10 @@ where
let mut to_remove = Vec::new(); let mut to_remove = Vec::new();
for node in self.nodes.iter() { for node in self.nodes.iter() {
if let Some(adjacencies) = self.get_adjacencies(node) { if self.neighbors(node).is_empty() {
if adjacencies.is_empty() {
to_remove.push(node.clone()); to_remove.push(node.clone());
} }
} }
}
for node in to_remove { for node in to_remove {
self.nodes.remove(&node); self.nodes.remove(&node);
@ -315,22 +244,20 @@ where
} }
let mut cc: BTreeSet<V> = BTreeSet::new(); let mut cc: BTreeSet<V> = BTreeSet::new();
let mut stack: Vec<&V> = vec![node]; let mut stack: Vec<V> = vec![node.clone()];
while let Some(node) = stack.pop() { while let Some(node) = stack.pop() {
if cc.contains(node) { if cc.contains(&node) {
continue; continue;
} }
pb.inc(1); pb.inc(1);
cc.insert(node.clone()); cc.insert(node.clone());
if let Some(adjacencies) = self.directed.get_adjacencies(&node) { for adj in self.neighbors(&node) {
for adj in adjacencies {
stack.push(adj); stack.push(adj);
} }
} }
}
visited.extend(cc.iter().map(|x| x.to_owned())); visited.extend(cc.iter().map(|x| x.to_owned()));
result.push(cc.iter().map(|x| x.to_owned()).collect()); result.push(cc.iter().map(|x| x.to_owned()).collect());
@ -371,7 +298,12 @@ where
let mut curr = node; let mut curr = node;
let mut path = vec![curr.clone()]; let mut path = vec![curr.clone()];
while let Some(adjacencies) = self.directed.get_adjacencies(&curr) { loop {
let adjacencies = self.neighbors(&curr);
if adjacencies.is_empty() {
break;
}
let probes = adjacencies let probes = adjacencies
.iter() .iter()
.filter(|&x| !path.contains(x)) .filter(|&x| !path.contains(x))
@ -399,10 +331,8 @@ where
compacted_count += path.len() - 2; compacted_count += path.len() - 2;
if let Some(adjacencies) = self.directed.get_adjacencies(&curr) { for adj in self.neighbors(&curr) {
for adj in adjacencies { stack.push(adj);
stack.push(adj.clone());
}
} }
} }
} }

@ -0,0 +1,92 @@
use std::{
collections::{BTreeMap, BTreeSet},
fmt::Debug,
};
use super::{AdjacencyGraph, DirectedAcyclicGraph, Graph};
impl<V> Graph<V> for DirectedAcyclicGraph<V>
where
V: Ord + Clone,
{
fn new() -> Self {
DirectedAcyclicGraph(AdjacencyGraph::new())
}
fn to_adjecency_graph(&self) -> AdjacencyGraph<V> {
self.0.clone()
}
fn nodes(&self) -> BTreeSet<V> {
self.0.nodes()
}
fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>> {
self.0.adjacencies()
}
fn neighbors(&self, from: &V) -> BTreeSet<V> {
self.0.neighbors(from)
}
fn edges(&self) -> BTreeSet<(V, V)> {
self.0.edges()
}
fn add_node(&mut self, node: V) {
self.0.add_node(node);
}
fn add_edge(&mut self, from: V, to: V) {
self.0.add_edge(from, to);
}
fn remove_node(&mut self, node: &V) {
self.0.remove_node(node);
}
fn remove_edge(&mut self, from: &V, to: &V) {
self.0.remove_edge(from, to);
}
}
impl<V> DirectedAcyclicGraph<V>
where
V: Ord + Eq + Clone + Debug,
{
pub fn all_paths<F>(&self, start: &V, mut visit_fn: F)
where
F: FnMut(Vec<V>) -> bool,
{
let mut prev: BTreeMap<V, V> = BTreeMap::new();
let mut stack: Vec<(V, Option<V>)> = vec![(start.clone(), None)];
while let Some((node, parent)) = stack.pop() {
if let Some(p) = parent {
prev.insert(node.clone(), p.clone());
}
let neighbors = self.neighbors(&node);
if neighbors.is_empty() {
let mut path = vec![];
let mut current = node.clone();
while let Some(next) = prev.get(&current) {
path.push(current);
current = next.clone();
}
path.reverse();
if !visit_fn(path) {
return;
}
}
for n in neighbors {
stack.push((n.clone(), Some(node.clone())));
}
}
}
}

@ -0,0 +1,74 @@
use std::collections::{BTreeMap, BTreeSet};
use super::{AdjacencyGraph, Graph};
impl<V> Graph<V> for AdjacencyGraph<V>
where
V: Ord + Clone,
{
fn new() -> Self
where
Self: Sized,
{
AdjacencyGraph {
nodes: BTreeSet::new(),
adjacencies: BTreeMap::new(),
}
}
fn to_adjecency_graph(&self) -> AdjacencyGraph<V> {
self.clone()
}
fn nodes(&self) -> BTreeSet<V> {
self.nodes.clone()
}
fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>> {
self.adjacencies.clone()
}
fn edges(&self) -> BTreeSet<(V, V)> {
self.adjacencies
.iter()
.flat_map(|(from, tos)| tos.iter().map(move |to| (from.clone(), to.clone())))
.collect()
}
fn neighbors(&self, from: &V) -> BTreeSet<V> {
if let Some(neighbors) = self.adjacencies.get(from) {
neighbors.clone()
} else {
BTreeSet::new()
}
}
fn add_node(&mut self, node: V) {
self.nodes.insert(node);
}
fn add_edge(&mut self, from: V, to: V) {
self.nodes.insert(from.clone());
self.nodes.insert(to.clone());
self.adjacencies
.entry(from)
.or_insert_with(BTreeSet::new)
.insert(to.clone());
}
fn remove_node(&mut self, node: &V) {
self.nodes.remove(node);
self.adjacencies.remove(node);
for adjacencies in self.adjacencies.values_mut() {
adjacencies.remove(node);
}
}
fn remove_edge(&mut self, from: &V, to: &V) {
if let Some(adjacencies) = self.adjacencies.get_mut(from) {
adjacencies.remove(to);
}
}
}

@ -8,7 +8,7 @@ use std::{
use indicatif::ProgressBar; use indicatif::ProgressBar;
use super::AdjacencyGraph; use super::{AdjacencyGraph, DirectedAcyclicGraph, Graph};
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] #[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub enum EdgeType { pub enum EdgeType {
@ -63,15 +63,14 @@ where
.insert((parent.clone(), node.clone()), EdgeType::TreeEdge); .insert((parent.clone(), node.clone()), EdgeType::TreeEdge);
} }
if let Some(adjacencies) = graph.get_adjacencies(node) { for adj in graph.neighbors(node) {
for adj in adjacencies.iter() { if !self.visited.contains(&adj) {
if !self.visited.contains(adj) { self.dfs(graph, &adj, Some(node));
self.dfs(graph, adj, Some(node));
} else { } else {
if !self.finished_nodes.contains(adj) { if !self.finished_nodes.contains(&adj) {
self.edge_types self.edge_types
.insert((node.clone(), adj.clone()), EdgeType::BackEdge); .insert((node.clone(), adj.clone()), EdgeType::BackEdge);
} else if self.start_times.get(node) < self.start_times.get(adj) { } else if self.start_times.get(node) < self.start_times.get(&adj) {
self.edge_types self.edge_types
.insert((node.clone(), adj.clone()), EdgeType::ForwardEdge); .insert((node.clone(), adj.clone()), EdgeType::ForwardEdge);
} else { } else {
@ -80,7 +79,6 @@ where
} }
} }
} }
}
self.time += 1; self.time += 1;
self.finished_nodes.insert(node.clone()); self.finished_nodes.insert(node.clone());
@ -152,8 +150,7 @@ where
node, node,
continue_from: index, continue_from: index,
} => { } => {
if let Some(adjacencies) = self.get_adjacencies(&node) { for (i, adj) in self.neighbors(&node).iter().enumerate() {
for (i, adj) in adjacencies.iter().enumerate() {
if i < index { if i < index {
continue; continue;
} }
@ -170,21 +167,14 @@ where
break; break;
} else { } else {
if !finished_nodes.contains(adj) { if !finished_nodes.contains(adj) {
edge_types.insert( edge_types
(node.clone(), adj.clone()), .insert((node.clone(), adj.clone()), EdgeType::BackEdge);
EdgeType::BackEdge,
);
} else if start_times.get(&node) < start_times.get(adj) { } else if start_times.get(&node) < start_times.get(adj) {
edge_types.insert( edge_types
(node.clone(), adj.clone()), .insert((node.clone(), adj.clone()), EdgeType::ForwardEdge);
EdgeType::ForwardEdge,
);
} else { } else {
edge_types.insert( edge_types
(node.clone(), adj.clone()), .insert((node.clone(), adj.clone()), EdgeType::CrossEdge);
EdgeType::CrossEdge,
);
}
} }
} }
} }
@ -200,4 +190,22 @@ where
progress_bar.finish(); progress_bar.finish();
return edge_types; return edge_types;
} }
// Constructs a Directed Acyclic Graph from the current graph by stripping out the back edges
pub fn dag(&self) -> DirectedAcyclicGraph<V> {
let edge_types = self.compute_edge_types();
let mut graph = AdjacencyGraph::new();
for ((from, to), edge_type) in edge_types.iter() {
match edge_type {
EdgeType::BackEdge => {}
_ => {
graph.add_edge(from.clone(), to.clone());
}
}
}
return DirectedAcyclicGraph(graph);
}
} }

@ -5,14 +5,58 @@ use std::{
pub trait Graph<V> pub trait Graph<V>
where where
V: Clone, V: Ord + Clone,
{
fn new() -> Self
where
Self: Sized;
fn from_edges(edges: &[(V, V)]) -> Self
where
Self: Sized,
{ {
let mut graph = Self::new();
for (from, to) in edges {
graph.add_edge(from.clone(), to.clone());
}
graph
}
fn to_adjecency_graph(&self) -> AdjacencyGraph<V>;
fn nodes(&self) -> BTreeSet<V>; fn nodes(&self) -> BTreeSet<V>;
fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>>; fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>>;
fn edges(&self) -> BTreeSet<(V, V)>; fn edges(&self) -> BTreeSet<(V, V)>;
fn neighbors(&self, from: &V) -> BTreeSet<V>;
fn add_node(&mut self, node: V);
fn add_edge(&mut self, from: V, to: V);
fn remove_node(&mut self, node: &V);
fn remove_edge(&mut self, from: &V, to: &V);
fn restricted(&self, nodes: &Vec<V>) -> Self
where
Self: Sized,
{
let nodes_index = nodes.iter().collect::<BTreeSet<_>>();
let mut restricted = Self::new();
for node in nodes {
for adj in self.neighbors(&node) {
if nodes_index.contains(&adj) {
restricted.add_edge(node.clone(), adj.clone());
}
}
} }
#[derive(Debug)] restricted
}
}
#[derive(Debug, Clone)]
pub struct AdjacencyGraph<V> pub struct AdjacencyGraph<V>
where where
V: Clone, V: Clone,
@ -29,8 +73,16 @@ where
pub directed: AdjacencyGraph<V>, pub directed: AdjacencyGraph<V>,
} }
#[derive(Debug)]
pub struct DirectedAcyclicGraph<V>(pub AdjacencyGraph<V>)
where
V: Clone;
pub mod algorithms; pub mod algorithms;
pub mod dag;
pub mod directed;
pub mod edge_types; pub mod edge_types;
pub mod undirected;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
@ -141,4 +193,29 @@ mod tests {
g.compact_chains(); g.compact_chains();
println!("{:?}", g); println!("{:?}", g);
} }
#[test]
fn test_all_paths() {
let g = AdjacencyGraph::from_edges(&[
//
("u", "v"),
("u", "x"),
("v", "y"),
("y", "x"),
("y", "w"),
("w", "z"),
("z", "x"),
("x", "h"),
("x", "g"),
("h", "i"),
("g", "i"),
]);
let g = g.dag();
g.all_paths(&"u", |path| {
println!("{:?}", path);
false
});
}
} }

@ -0,0 +1,55 @@
use std::collections::{BTreeMap, BTreeSet};
use super::{AdjacencyGraph, Graph, UndirectedGraph};
impl<V> Graph<V> for UndirectedGraph<V>
where
V: Ord + Clone,
{
fn new() -> Self
where
Self: Sized,
{
UndirectedGraph {
directed: AdjacencyGraph::new(),
}
}
fn to_adjecency_graph(&self) -> AdjacencyGraph<V> {
self.directed.clone()
}
fn nodes(&self) -> BTreeSet<V> {
self.directed.nodes()
}
fn adjacencies(&self) -> BTreeMap<V, BTreeSet<V>> {
self.directed.adjacencies()
}
fn neighbors(&self, from: &V) -> BTreeSet<V> {
self.directed.neighbors(from)
}
fn edges(&self) -> BTreeSet<(V, V)> {
self.directed.edges()
}
fn add_node(&mut self, node: V) {
self.directed.add_node(node);
}
fn add_edge(&mut self, from: V, to: V) {
self.directed.add_edge(from.clone(), to.clone());
self.directed.add_edge(to, from);
}
fn remove_node(&mut self, node: &V) {
self.directed.remove_node(node);
}
fn remove_edge(&mut self, from: &V, to: &V) {
self.directed.remove_edge(from, to);
self.directed.remove_edge(to, from);
}
}

@ -2,6 +2,7 @@
mod gfa; mod gfa;
mod graph; mod graph;
mod rolling_hash;
use std::{ use std::{
collections::{BTreeMap, HashMap}, collections::{BTreeMap, HashMap},
@ -12,9 +13,10 @@ use std::{
use argh::FromArgs; use argh::FromArgs;
use gfa::{Entry, Orientation}; use gfa::{Entry, Orientation};
use graph::{AdjacencyGraph, Graph}; use graph::{AdjacencyGraph, DirectedAcyclicGraph, Graph};
use indicatif::ProgressIterator; use indicatif::ProgressIterator;
use rand::seq::SliceRandom; use rand::seq::SliceRandom;
use rolling_hash::RollingHash;
#[derive(FromArgs, PartialEq, Debug)] #[derive(FromArgs, PartialEq, Debug)]
/// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024 /// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024
@ -36,21 +38,39 @@ struct CommandShow {
#[argh(option, short = 'i')] #[argh(option, short = 'i')]
/// file to read /// file to read
input: String, input: String,
#[argh(option, short = 'c', default = "1")]
/// number of paths to visit
path_count: usize,
#[argh(option, short = 'p', default = "\"ACGT\".to_string()")]
/// k-mer pattern to search
pattern: String,
#[argh(option, short = 'k', default = "4")]
/// k-mer length
k: usize,
} }
fn main() -> std::io::Result<()> { fn main() -> std::io::Result<()> {
let opts = argh::from_env::<CliTool>(); let opts = argh::from_env::<CliTool>();
match opts.nested { match opts.nested {
CliSubcommands::Show(show) => { CliSubcommands::Show(opts) => {
let file_lines_count = BufReader::new(std::fs::File::open(&show.input)?) // validate opts.pattern is a valid DNA sequence
if opts.pattern.chars().any(|c| !"ACGT".contains(c)) {
eprintln!("Invalid pattern: {:?}", opts.pattern);
process::exit(1);
}
let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?)
.lines() .lines()
.progress_with( .progress_with(
indicatif::ProgressBar::new_spinner().with_message("estimating line count"), indicatif::ProgressBar::new_spinner().with_message("estimating line count"),
) )
.count() as u64; .count() as u64;
let file = std::fs::File::open(show.input)?; let file = std::fs::File::open(opts.input)?;
let entries = gfa::parser::parse_source(file, file_lines_count)?; let entries = gfa::parser::parse_source(file, file_lines_count)?;
@ -76,24 +96,24 @@ fn main() -> std::io::Result<()> {
} }
} }
compute_graph_stats(&graph); compute_graph_degrees(&graph);
let edge_types = compute_edge_types(&graph); let dag = graph.dag();
println!("Removing back edges..."); // let edge_types = compute_edge_types(&graph);
for ((from, to), edge_type) in edge_types.iter() { // for ((from, to), edge_type) in edge_types.iter() {
match edge_type { // match edge_type {
graph::edge_types::EdgeType::BackEdge => { // graph::edge_types::EdgeType::BackEdge => {
graph.remove_edge(from, to); // graph.remove_edge(from, to);
} // }
_ => {} // _ => {}
} // }
} // }
compute_edge_types(&graph); compute_edge_types(&dag.0);
let ccs = compute_ccs(&graph); let ccs = compute_ccs(&dag.0);
println!("Picking largest connected component..."); println!("Picking largest connected component...");
// pick the largest connected component // pick the largest connected component
@ -102,21 +122,47 @@ fn main() -> std::io::Result<()> {
.max_by_key(|cc| cc.len()) .max_by_key(|cc| cc.len())
.expect("at least one connected components"); .expect("at least one connected components");
let largest_cc_graph = graph.restricted(largest_cc); let largest_cc_graph = dag.restricted(largest_cc);
compute_graph_stats(&largest_cc_graph); let degrees = compute_graph_degrees(&largest_cc_graph);
compute_edge_types(&largest_cc_graph); compute_edge_types(&largest_cc_graph); // to double check this is a DAG
// let mut largest_cc_graph = graph.restricted(largest_cc).undirected(); println!("Searching for a start node...");
let start_node = degrees
.iter()
.find(|(_, degree)| degree.in_degree == 0)
.expect("no start node found")
.0;
// compute_graph_stats(&largest_cc_graph); println!("Start node: {:?}", start_node);
// // compute_edge_types(&largest_cc_graph); println!("{:?}", degrees.get(start_node).unwrap());
// println!("Compacting chains..."); compute_orientation_histogram(&largest_cc_graph);
// largest_cc_graph.compact_chains();
// compute_graph_stats(&largest_cc_graph); println!("Visiting the graph, searching {} paths...", opts.path_count);
// compute_edge_types(&largest_cc_graph.directed);
let sequences = compute_sequences(
&sequence_map,
&largest_cc_graph,
start_node,
opts.path_count,
);
for (i, sequence) in sequences.iter().enumerate() {
println!("Sequence #{} of length {}", i + 1, sequence.len());
println!("Searching {} in the sequence (naive)...", opts.pattern);
let occurrences = compute_sequence_occurrences_naive(sequence, &opts.pattern);
println!("Occurrences: {:?}\n", occurrences);
println!(
"Searching {} in the sequence (rolling hash)...",
opts.pattern
);
let occurrences =
compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern);
println!("Occurrences: {:?}\n", occurrences);
}
println!("Cleaning up..."); println!("Cleaning up...");
process::exit(0); process::exit(0);
@ -124,6 +170,123 @@ fn main() -> std::io::Result<()> {
} }
} }
fn letter_to_number(letter: char) -> u64 {
match letter {
'A' => 1,
'C' => 2,
'G' => 3,
'T' => 4,
_ => panic!("Invalid letter: {}", letter),
}
}
fn compute_sequence_occurrences_rolling_hash(sequence: &str, pattern: &str) -> Vec<usize> {
let chars = sequence.chars().map(letter_to_number).collect::<Vec<_>>();
let mut occurrences = vec![];
let mut rl = RollingHash::new(3000, 5);
// let mut rl = RollingHash::new(1_000_000, 5);
let pattern_hash = rl.hash_pattern(&pattern.chars().map(letter_to_number).collect::<Vec<_>>());
for i in 0..pattern.len() {
// println!("Adding letter: {}", chars[i]);
rl.add_last(chars[i]);
}
for i in pattern.len()..sequence.len() {
let hash = rl.hash();
if rl.compare(&hash, &pattern_hash) {
println!("Hash match at position {}", i);
// check for false positives
if &sequence[i - pattern.len()..i] != pattern {
println!("=> False positive");
} else {
println!("=> Correct");
occurrences.push(i - pattern.len());
}
}
rl.advance(chars[i]);
}
occurrences
}
fn compute_sequence_occurrences_naive(sequence: &str, pattern: &str) -> Vec<usize> {
let mut occurrences = vec![];
for i in 0..sequence.len() - pattern.len() {
if &sequence[i..i + pattern.len()] == pattern {
occurrences.push(i);
}
}
occurrences
}
fn compute_sequences(
sequence_map: &HashMap<String, String>,
graph: &DirectedAcyclicGraph<(String, Orientation)>,
start_node: &(String, Orientation),
count: usize,
) -> Vec<String> {
let mut sequences = vec![];
let mut path_counter = 0;
graph.all_paths(start_node, |path| {
println!("Path #{} of length {}", path_counter + 1, path.len());
let mut sequence = String::new();
for node in path {
let (id, orientation) = node;
let seq = sequence_map.get(&id).expect("sequence not found");
match orientation {
Orientation::Forward => sequence.push_str(seq),
Orientation::Reverse => sequence.push_str(
&seq.chars()
.map(|c| match c {
'A' => 'T',
'T' => 'A',
'C' => 'G',
'G' => 'C',
_ => panic!("Invalid letter: {}", c),
})
.rev()
.collect::<String>(),
),
}
}
sequences.push(sequence);
path_counter += 1;
path_counter < count
});
sequences
}
fn compute_orientation_histogram(graph: &impl Graph<(String, Orientation)>) {
let orientation_histogram =
graph
.nodes()
.iter()
.map(|node| node.1)
.fold(BTreeMap::new(), |mut acc, orientation| {
*acc.entry(orientation).or_insert(0) += 1;
acc
});
println!("Orientation histogram:");
for (orientation, count) in orientation_histogram.iter() {
println!("- {:?}: {}", orientation, count);
}
}
fn compute_ccs<V>(graph: &AdjacencyGraph<V>) -> Vec<Vec<V>> fn compute_ccs<V>(graph: &AdjacencyGraph<V>) -> Vec<Vec<V>>
where where
V: Ord + Eq + Clone + Debug, V: Ord + Eq + Clone + Debug,
@ -148,12 +311,12 @@ where
ccs ccs
} }
fn compute_edge_types<V>(graph: &AdjacencyGraph<V>) -> BTreeMap<(V, V), graph::edge_types::EdgeType> fn compute_edge_types<V>(graph: &impl Graph<V>) -> BTreeMap<(V, V), graph::edge_types::EdgeType>
where where
V: Ord + Eq + Clone + Debug, V: Ord + Eq + Clone + Debug,
{ {
println!("Computing edge types..."); println!("Computing edge types...");
let edge_types = graph.compute_edge_types(); let edge_types = graph.to_adjecency_graph().compute_edge_types();
println!("Computing edge types histogram..."); println!("Computing edge types histogram...");
let histogram = edge_types.iter().map(|(_, edge_type)| edge_type).fold( let histogram = edge_types.iter().map(|(_, edge_type)| edge_type).fold(
@ -167,7 +330,7 @@ where
println!("Node count: {}", graph.nodes().len()); println!("Node count: {}", graph.nodes().len());
println!( println!(
"Edge count: {}, Total edge count: {}", "Edge count: {}, Total edge count: {}",
graph.edges().count(), graph.edges().len(),
edge_types.len() edge_types.len()
); );
println!("Edge types histogram (type/count):"); println!("Edge types histogram (type/count):");
@ -186,7 +349,7 @@ where
let mut g2 = AdjacencyGraph::new(); let mut g2 = AdjacencyGraph::new();
let mut shuffled_nodes: Vec<_> = graph.nodes().iter().collect::<Vec<_>>(); let mut shuffled_nodes: Vec<_> = graph.nodes().into_iter().collect::<Vec<_>>();
shuffled_nodes.shuffle(&mut rand::thread_rng()); shuffled_nodes.shuffle(&mut rand::thread_rng());
for node in shuffled_nodes.iter() { for node in shuffled_nodes.iter() {
@ -210,9 +373,16 @@ where
g2 g2
} }
#[derive(Debug)]
struct NodeDegree {
degree: usize,
in_degree: usize,
out_degree: usize,
}
/// This function prints the number of nodes, edges and a histogram of the degrees of the nodes /// This function prints the number of nodes, edges and a histogram of the degrees of the nodes
/// in the graph (computing the degrees might take a long time) /// in the graph (computing the degrees might take a long time)
fn compute_graph_stats<V>(graph: &impl Graph<V>) fn compute_graph_degrees<V>(graph: &impl Graph<V>) -> BTreeMap<V, NodeDegree>
where where
V: Ord + Eq + Clone + Debug, V: Ord + Eq + Clone + Debug,
{ {
@ -226,6 +396,12 @@ where
let progress_bar = indicatif::ProgressBar::new(graph.edges().len() as u64); let progress_bar = indicatif::ProgressBar::new(graph.edges().len() as u64);
for node in graph.nodes() {
vertices_degrees.insert(node.clone(), 0);
vertices_in_degrees.insert(node.clone(), 0);
vertices_out_degrees.insert(node.clone(), 0);
}
for (from, tos) in graph.adjacencies() { for (from, tos) in graph.adjacencies() {
*vertices_degrees.entry(from.clone()).or_insert(0) += tos.len(); *vertices_degrees.entry(from.clone()).or_insert(0) += tos.len();
*vertices_out_degrees.entry(from.clone()).or_insert(0) += tos.len(); *vertices_out_degrees.entry(from.clone()).or_insert(0) += tos.len();
@ -248,6 +424,20 @@ where
*acc.entry(degree).or_insert(0) += 1; *acc.entry(degree).or_insert(0) += 1;
acc acc
}); });
let histogram_in: BTreeMap<usize, usize> = vertices_in_degrees
.iter()
.map(|(_, degree)| *degree)
.fold(BTreeMap::new(), |mut acc, degree| {
*acc.entry(degree).or_insert(0) += 1;
acc
});
let histogram_out: BTreeMap<usize, usize> = vertices_out_degrees
.iter()
.map(|(_, degree)| *degree)
.fold(BTreeMap::new(), |mut acc, degree| {
*acc.entry(degree).or_insert(0) += 1;
acc
});
println!("Stats:"); println!("Stats:");
println!("- Nodes: {}", graph.nodes().len()); println!("- Nodes: {}", graph.nodes().len());
@ -257,4 +447,49 @@ where
for (degree, count) in histogram.iter() { for (degree, count) in histogram.iter() {
println!("- {}: {}", degree, count); println!("- {}: {}", degree, count);
} }
println!("In-degrees histogram (degree/count):");
for (degree, count) in histogram_in.iter() {
println!("- {}: {}", degree, count);
}
println!("Out-degrees histogram (degree/count):");
for (degree, count) in histogram_out.iter() {
println!("- {}: {}", degree, count);
}
let mut node_degrees = BTreeMap::new();
for node in graph.nodes() {
node_degrees.insert(
node.clone(),
NodeDegree {
degree: *vertices_degrees.get(&node).expect("node already computed"),
in_degree: *vertices_in_degrees
.get(&node)
.expect("node already computed"),
out_degree: *vertices_out_degrees
.get(&node)
.expect("node already computed"),
},
);
}
node_degrees
} }
// fn compute_compact_graph<V>(graph: &mut UndirectedGraph<V>) -> UndirectedGraph<V>
// where
// V: Ord + Eq + Clone + Debug,
// {
// compute_graph_degrees(graph);
// // compute_edge_types(graph);
// println!("Compacting chains...");
// graph.compact_chains();
// compute_graph_degrees(graph);
// compute_edge_types(graph.directed);
// graph.clone()
// }

@ -0,0 +1,238 @@
use std::{collections::VecDeque, fmt::Debug};
pub struct RollingHash<T: Into<u64> + Clone> {
modulus: u64,
alphabet_size: u64,
offset: u64,
current_word: VecDeque<T>,
hash: u64,
}
#[derive(Debug)]
pub struct Hashed {
hash: u64,
offset: u64,
}
fn wrapping_pow_correct(a: u64, b: u64) -> u64 {
// // println!("Wrapping pow: {}^{}", a, b);
// let mut result = 1u64;
// for _ in 0..b {
// result = result.wrapping_mul(a);
// }
// // println!("=> {}", result);
// result
a.wrapping_pow(b as u32)
}
impl<T> RollingHash<T>
where
T: Into<u64> + Clone + Debug,
{
pub fn new(modulus: u64, alphabet_size: u64) -> Self {
RollingHash {
modulus,
alphabet_size,
offset: 0,
current_word: VecDeque::new(),
hash: 0,
}
}
// pub fn hash(&self) -> u64 {
// self.hash % self.modulus
// }
pub fn hash(&self) -> Hashed {
Hashed {
hash: self.hash % self.modulus,
offset: self.offset,
}
}
pub fn compare(&self, lhs: &Hashed, rhs: &Hashed) -> bool {
// println!("Comparing: {:?} {:?}", lhs, rhs);
let (lhs, rhs) = if lhs.offset < rhs.offset {
(lhs, rhs)
} else {
(rhs, lhs)
};
// Shift lhs to the right by the difference in offsets
let shifted_lhs = (lhs.hash
* wrapping_pow_correct(self.alphabet_size, rhs.offset - lhs.offset))
% self.modulus;
shifted_lhs == rhs.hash
}
pub fn hash_pattern(&self, pattern: &[T]) -> Hashed {
let mut hash = 0;
for (i, value) in pattern.iter().enumerate() {
let char_hash =
value.clone().into() * wrapping_pow_correct(self.alphabet_size, i as u64);
hash += char_hash;
}
Hashed { hash, offset: 0 }
}
pub fn add_last(&mut self, value: T) {
self.current_word.push_back(value.clone());
let i = self.offset + (self.current_word.len() as u64) - 1;
// println!("Alphabet size: {}", self.alphabet_size);
// println!("Index: {}", i);
// println!(
// "Adding: {:?} * {} to {}",
// value,
// wrapping_pow_correct(self.alphabet_size, i),
// self.hash
// );
self.hash += value.into() * wrapping_pow_correct(self.alphabet_size, i);
}
pub fn remove_first(&mut self) {
let value = self.current_word.pop_front().unwrap();
let i = self.offset;
self.hash -= value.into() * wrapping_pow_correct(self.alphabet_size, i);
self.offset += 1;
}
pub fn advance(&mut self, value: T) {
self.add_last(value);
self.remove_first();
}
pub fn hash_value_at(&self, h: &Hashed, pos: u64) -> u64 {
let offset = h.offset;
let hash = h.hash;
let diff = pos as i64 - offset as i64;
if diff < 0 {
panic!("Invalid position");
}
(hash * wrapping_pow_correct(self.alphabet_size, diff as u64)) % self.modulus
}
pub fn hash_value_at_caret(&self, h: &Hashed) -> u64 {
self.hash_value_at(h, self.offset)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_rolling_hash() {
println!("Rolling hash test");
let modulus = 42;
let alphabet_size = 4;
let mut rh = RollingHash::<u64>::new(modulus, alphabet_size);
let initial_pattern_hash = rh.hash_pattern(&[1, 2, 3, 4, 5]);
println!("Initial pattern hash: {:?}", initial_pattern_hash);
rh.add_last(1);
rh.add_last(2);
rh.add_last(3);
rh.add_last(4);
rh.add_last(5);
println!("Hash: {:?}", rh.hash());
rh.advance(0);
println!("Hash: {:?}", rh.hash());
rh.advance(1);
rh.advance(2);
rh.advance(3);
rh.advance(4);
rh.advance(5);
println!("Current word: {:?}", rh.current_word);
println!("Hash: {:?}", rh.hash());
println!(
"Shifted pattern hash: {}",
rh.hash_value_at_caret(&initial_pattern_hash)
);
let pattern = initial_pattern_hash;
let curr_hash = rh.hash();
println!("Pattern hash: {:?}", pattern);
println!("Current hash: {:?}", curr_hash);
println!(
"Pattern hash at caret: {}",
rh.hash_value_at_caret(&pattern)
);
println!("Compare: {:?}", rh.compare(&pattern, &curr_hash));
}
#[test]
fn test_geometry_rolling_hash() {
println!("Geometry rolling hash test");
let modulus = 10_000_000;
let alphabet_size = 2;
let mut rh = RollingHash::<u64>::new(modulus, alphabet_size);
let initial_pattern_hash = rh.hash_pattern(&[1, 1, 1, 1]);
rh.add_last(1);
rh.add_last(1);
rh.add_last(1);
rh.add_last(1);
println!("Initial pattern hash: {:?}", initial_pattern_hash);
println!("Hash: {:?}", rh.hash());
rh.advance(1);
println!("Hash: {:?}", rh.hash());
println!(
"Shifted pattern hash: {:?}",
rh.hash_value_at(&initial_pattern_hash, rh.offset)
);
}
#[test]
fn test_wrappping_pow() {
println!("Wrapping pow test");
let a = 2;
let b = 3;
let result = wrapping_pow_correct(a, b);
assert_eq!(result, 8);
let a = 3;
let b = 100;
let result = wrapping_pow_correct(a, b);
assert_ne!(result, 0);
}
}
Loading…
Cancel
Save