diff --git a/src/graph/algorithms.rs b/src/graph/algorithms.rs index cff1623..518b26f 100644 --- a/src/graph/algorithms.rs +++ b/src/graph/algorithms.rs @@ -9,110 +9,57 @@ use indicatif::{ProgressBar, ProgressIterator}; use super::{AdjacencyGraph, Graph, UndirectedGraph}; -impl Graph for AdjacencyGraph -where - V: Ord + Clone, -{ - fn nodes(&self) -> BTreeSet { - self.nodes.clone() - } - - fn adjacencies(&self) -> BTreeMap> { - self.adjacencies.clone() - } - - fn edges(&self) -> BTreeSet<(V, V)> { - self.adjacencies - .iter() - .flat_map(|(from, tos)| tos.iter().map(move |to| (from.clone(), to.clone()))) - .collect() - } - - // pub fn edges(&self) -> impl Iterator { - // self.adjacencies - // .iter() - // .flat_map(|(from, tos)| tos.iter().map(move |to| (from, to))) - // } -} - -impl Graph for UndirectedGraph -where - V: Ord + Clone, -{ - fn nodes(&self) -> BTreeSet { - self.directed.nodes() - } - - fn adjacencies(&self) -> BTreeMap> { - self.directed.adjacencies() - } - - fn edges(&self) -> BTreeSet<(V, V)> { - self.directed.edges() - } -} - #[allow(dead_code)] impl AdjacencyGraph where V: Ord + Eq + Clone + Debug, { - pub fn new() -> Self { - AdjacencyGraph { - nodes: BTreeSet::new(), - adjacencies: BTreeMap::new(), - } - } - - pub fn from_edges(edges: &[(V, V)]) -> Self { - let mut graph = AdjacencyGraph::new(); - - for (from, to) in edges { - graph.add_edge(from.clone(), to.clone()); - } - - graph - } + // pub fn new() -> Self { + // AdjacencyGraph { + // nodes: BTreeSet::new(), + // adjacencies: BTreeMap::new(), + // } + // } - pub fn add_node(&mut self, node: V) { - self.nodes.insert(node); - } + // pub fn add_node(&mut self, node: V) { + // self.nodes.insert(node); + // } - pub fn add_edge(&mut self, from: V, to: V) { - self.add_node(from.clone()); - self.add_node(to.clone()); + // pub fn add_edge(&mut self, from: V, to: V) { + // self.add_node(from.clone()); + // self.add_node(to.clone()); - self.adjacencies - .entry(from) - .or_insert_with(BTreeSet::new) - .insert(to); - } + // self.adjacencies + // .entry(from) + // .or_insert_with(BTreeSet::new) + // .insert(to); + // } - pub fn remove_edge(&mut self, from: &V, to: &V) { - if let Some(adjacencies) = self.adjacencies.get_mut(from) { - adjacencies.remove(to); - } - } + // pub fn remove_edge(&mut self, from: &V, to: &V) { + // if let Some(adjacencies) = self.adjacencies.get_mut(from) { + // adjacencies.remove(to); + // } + // } - pub fn get_adjacencies(&self, node: &V) -> Option<&BTreeSet> { - self.adjacencies.get(node) - } + // pub fn get_adjacencies(&self, node: &V) -> Option<&BTreeSet> { + // self.adjacencies.get(node) + // } - pub fn adjacencies(&self) -> &BTreeMap> { - &self.adjacencies - } + // pub fn adjacencies(&self) -> &BTreeMap> { + // &self.adjacencies + // } - pub fn nodes(&self) -> &BTreeSet { - &self.nodes - } + // pub fn nodes(&self) -> &BTreeSet { + // &self.nodes + // } - pub fn edges(&self) -> impl Iterator { - self.adjacencies - .iter() - .flat_map(|(from, tos)| tos.iter().map(move |to| (from, to))) - } + // pub fn edges(&self) -> impl Iterator { + // self.adjacencies + // .iter() + // .flat_map(|(from, tos)| tos.iter().map(move |to| (from, to))) + // } - pub fn opposite(&self) -> AdjacencyGraph<&V> { + pub fn opposite(&self) -> AdjacencyGraph { let mut opposite = AdjacencyGraph::new(); // O(|E|) @@ -142,11 +89,9 @@ where let mut restricted = AdjacencyGraph::new(); for node in nodes { - if let Some(adjacencies) = self.get_adjacencies(&node) { - for adj in adjacencies { - if nodes_index.contains(adj) { - restricted.add_edge(node.clone(), adj.clone()); - } + for adj in self.neighbors(&node) { + if nodes_index.contains(&adj) { + restricted.add_edge(node.clone(), adj.clone()); } } } @@ -155,66 +100,56 @@ where } pub fn has_edge(&self, from: &V, to: &V) -> bool { - // O(1) - if let Some(adjacencies) = self.get_adjacencies(from) { - // O(1) - adjacencies.contains(&to.to_owned()) - } else { - false - } + self.neighbors(from).contains(to) } - pub fn dfs<'a>(&'a self, node: &'a V) -> impl Iterator + 'a { - let mut visited = BTreeSet::new(); - let mut stack = VecDeque::from([node]); + // pub fn dfs<'a>(&'a self, node: &'a V) -> impl Iterator + 'a { + // let mut visited = BTreeSet::new(); + // let mut stack = VecDeque::from([node]); - std::iter::from_fn(move || { - while let Some(node) = stack.pop_back() { - if !visited.insert(node.clone()) { - continue; - } + // std::iter::from_fn(move || { + // while let Some(node) = stack.pop_back() { + // if !visited.insert(node.clone()) { + // continue; + // } - if let Some(adjacencies) = self.get_adjacencies(node) { - stack.extend(adjacencies); - } + // stack.extend(self.neighbors(node).iter()); - return Some(node.clone()); - } + // return Some(node.clone()); + // } - None - }) - } + // None + // }) + // } - pub fn shortest_path_matrix(&self) -> BTreeMap<&V, BTreeMap<&V, usize>> { + pub fn shortest_path_matrix(&self) -> BTreeMap> { let mut result = BTreeMap::new(); for node in self.nodes.iter() { let mut distances = BTreeMap::new(); let mut visited = BTreeSet::new(); - let mut queue = VecDeque::from([node]); + let mut queue = VecDeque::from([node.clone()]); - distances.insert(node, 0); + distances.insert(node.clone(), 0); while let Some(node) = queue.pop_front() { - if visited.contains(node) { + if visited.contains(&node) { continue; } visited.insert(node.clone()); - let distance = *distances.get(node).unwrap(); + let distance = *distances.get(&node).unwrap(); - if let Some(adjacencies) = self.get_adjacencies(node) { - for adj in adjacencies { - if !distances.contains_key(adj) { - distances.insert(adj, distance + 1); - queue.push_back(adj); - } + for adj in self.neighbors(&node) { + if !distances.contains_key(&adj) { + distances.insert(adj.clone(), distance + 1); + queue.push_back(adj.clone()); } } } - result.insert(node, distances); + result.insert(node.clone(), distances); } result @@ -242,25 +177,21 @@ where } let mut cc: BTreeSet = BTreeSet::new(); - let mut stack: Vec<&V> = vec![node]; + let mut stack: Vec = vec![node.clone()]; while let Some(node) = stack.pop() { - if cc.contains(node) { + if cc.contains(&node) { continue; } cc.insert(node.clone()); - if let Some(adjacencies) = self.get_adjacencies(&node) { - for adj in adjacencies { - stack.push(adj); - } + for adj in self.neighbors(&node) { + stack.push(adj); } - if let Some(adjacencies) = op.get_adjacencies(&node) { - for adj in adjacencies { - stack.push(adj); - } + for adj in op.neighbors(&node) { + stack.push(adj); } } @@ -275,10 +206,8 @@ where let mut to_remove = Vec::new(); for node in self.nodes.iter() { - if let Some(adjacencies) = self.get_adjacencies(node) { - if adjacencies.is_empty() { - to_remove.push(node.clone()); - } + if self.neighbors(node).is_empty() { + to_remove.push(node.clone()); } } @@ -315,20 +244,18 @@ where } let mut cc: BTreeSet = BTreeSet::new(); - let mut stack: Vec<&V> = vec![node]; + let mut stack: Vec = vec![node.clone()]; while let Some(node) = stack.pop() { - if cc.contains(node) { + if cc.contains(&node) { continue; } pb.inc(1); cc.insert(node.clone()); - if let Some(adjacencies) = self.directed.get_adjacencies(&node) { - for adj in adjacencies { - stack.push(adj); - } + for adj in self.neighbors(&node) { + stack.push(adj); } } @@ -371,7 +298,12 @@ where let mut curr = node; let mut path = vec![curr.clone()]; - while let Some(adjacencies) = self.directed.get_adjacencies(&curr) { + loop { + let adjacencies = self.neighbors(&curr); + if adjacencies.is_empty() { + break; + } + let probes = adjacencies .iter() .filter(|&x| !path.contains(x)) @@ -399,10 +331,8 @@ where compacted_count += path.len() - 2; - if let Some(adjacencies) = self.directed.get_adjacencies(&curr) { - for adj in adjacencies { - stack.push(adj.clone()); - } + for adj in self.neighbors(&curr) { + stack.push(adj); } } } diff --git a/src/graph/dag.rs b/src/graph/dag.rs new file mode 100644 index 0000000..5a70142 --- /dev/null +++ b/src/graph/dag.rs @@ -0,0 +1,92 @@ +use std::{ + collections::{BTreeMap, BTreeSet}, + fmt::Debug, +}; + +use super::{AdjacencyGraph, DirectedAcyclicGraph, Graph}; + +impl Graph for DirectedAcyclicGraph +where + V: Ord + Clone, +{ + fn new() -> Self { + DirectedAcyclicGraph(AdjacencyGraph::new()) + } + + fn to_adjecency_graph(&self) -> AdjacencyGraph { + self.0.clone() + } + + fn nodes(&self) -> BTreeSet { + self.0.nodes() + } + + fn adjacencies(&self) -> BTreeMap> { + self.0.adjacencies() + } + + fn neighbors(&self, from: &V) -> BTreeSet { + self.0.neighbors(from) + } + + fn edges(&self) -> BTreeSet<(V, V)> { + self.0.edges() + } + + fn add_node(&mut self, node: V) { + self.0.add_node(node); + } + + fn add_edge(&mut self, from: V, to: V) { + self.0.add_edge(from, to); + } + + fn remove_node(&mut self, node: &V) { + self.0.remove_node(node); + } + + fn remove_edge(&mut self, from: &V, to: &V) { + self.0.remove_edge(from, to); + } +} + +impl DirectedAcyclicGraph +where + V: Ord + Eq + Clone + Debug, +{ + pub fn all_paths(&self, start: &V, mut visit_fn: F) + where + F: FnMut(Vec) -> bool, + { + let mut prev: BTreeMap = BTreeMap::new(); + + let mut stack: Vec<(V, Option)> = vec![(start.clone(), None)]; + + while let Some((node, parent)) = stack.pop() { + if let Some(p) = parent { + prev.insert(node.clone(), p.clone()); + } + + let neighbors = self.neighbors(&node); + if neighbors.is_empty() { + let mut path = vec![]; + + let mut current = node.clone(); + while let Some(next) = prev.get(¤t) { + path.push(current); + current = next.clone(); + } + + path.reverse(); + + if !visit_fn(path) { + return; + } + } + + for n in neighbors { + stack.push((n.clone(), Some(node.clone()))); + } + } + } +} diff --git a/src/graph/directed.rs b/src/graph/directed.rs new file mode 100644 index 0000000..332f183 --- /dev/null +++ b/src/graph/directed.rs @@ -0,0 +1,74 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use super::{AdjacencyGraph, Graph}; + +impl Graph for AdjacencyGraph +where + V: Ord + Clone, +{ + fn new() -> Self + where + Self: Sized, + { + AdjacencyGraph { + nodes: BTreeSet::new(), + adjacencies: BTreeMap::new(), + } + } + + fn to_adjecency_graph(&self) -> AdjacencyGraph { + self.clone() + } + + fn nodes(&self) -> BTreeSet { + self.nodes.clone() + } + + fn adjacencies(&self) -> BTreeMap> { + self.adjacencies.clone() + } + + fn edges(&self) -> BTreeSet<(V, V)> { + self.adjacencies + .iter() + .flat_map(|(from, tos)| tos.iter().map(move |to| (from.clone(), to.clone()))) + .collect() + } + + fn neighbors(&self, from: &V) -> BTreeSet { + if let Some(neighbors) = self.adjacencies.get(from) { + neighbors.clone() + } else { + BTreeSet::new() + } + } + + fn add_node(&mut self, node: V) { + self.nodes.insert(node); + } + + fn add_edge(&mut self, from: V, to: V) { + self.nodes.insert(from.clone()); + self.nodes.insert(to.clone()); + + self.adjacencies + .entry(from) + .or_insert_with(BTreeSet::new) + .insert(to.clone()); + } + + fn remove_node(&mut self, node: &V) { + self.nodes.remove(node); + self.adjacencies.remove(node); + + for adjacencies in self.adjacencies.values_mut() { + adjacencies.remove(node); + } + } + + fn remove_edge(&mut self, from: &V, to: &V) { + if let Some(adjacencies) = self.adjacencies.get_mut(from) { + adjacencies.remove(to); + } + } +} diff --git a/src/graph/edge_types.rs b/src/graph/edge_types.rs index 868ca13..8308332 100644 --- a/src/graph/edge_types.rs +++ b/src/graph/edge_types.rs @@ -8,7 +8,7 @@ use std::{ use indicatif::ProgressBar; -use super::AdjacencyGraph; +use super::{AdjacencyGraph, DirectedAcyclicGraph, Graph}; #[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub enum EdgeType { @@ -63,21 +63,19 @@ where .insert((parent.clone(), node.clone()), EdgeType::TreeEdge); } - if let Some(adjacencies) = graph.get_adjacencies(node) { - for adj in adjacencies.iter() { - if !self.visited.contains(adj) { - self.dfs(graph, adj, Some(node)); + for adj in graph.neighbors(node) { + if !self.visited.contains(&adj) { + self.dfs(graph, &adj, Some(node)); + } else { + if !self.finished_nodes.contains(&adj) { + self.edge_types + .insert((node.clone(), adj.clone()), EdgeType::BackEdge); + } else if self.start_times.get(node) < self.start_times.get(&adj) { + self.edge_types + .insert((node.clone(), adj.clone()), EdgeType::ForwardEdge); } else { - if !self.finished_nodes.contains(adj) { - self.edge_types - .insert((node.clone(), adj.clone()), EdgeType::BackEdge); - } else if self.start_times.get(node) < self.start_times.get(adj) { - self.edge_types - .insert((node.clone(), adj.clone()), EdgeType::ForwardEdge); - } else { - self.edge_types - .insert((node.clone(), adj.clone()), EdgeType::CrossEdge); - } + self.edge_types + .insert((node.clone(), adj.clone()), EdgeType::CrossEdge); } } } @@ -152,39 +150,31 @@ where node, continue_from: index, } => { - if let Some(adjacencies) = self.get_adjacencies(&node) { - for (i, adj) in adjacencies.iter().enumerate() { - if i < index { - continue; - } + for (i, adj) in self.neighbors(&node).iter().enumerate() { + if i < index { + continue; + } - if !visited.contains(adj) { - continuations.push(Continuation::Neighbors { - node: node.clone(), - continue_from: i + 1, - }); - continuations.push(Continuation::Start { - node: adj.clone(), - parent: Some(node.clone()), - }); - break; + if !visited.contains(adj) { + continuations.push(Continuation::Neighbors { + node: node.clone(), + continue_from: i + 1, + }); + continuations.push(Continuation::Start { + node: adj.clone(), + parent: Some(node.clone()), + }); + break; + } else { + if !finished_nodes.contains(adj) { + edge_types + .insert((node.clone(), adj.clone()), EdgeType::BackEdge); + } else if start_times.get(&node) < start_times.get(adj) { + edge_types + .insert((node.clone(), adj.clone()), EdgeType::ForwardEdge); } else { - if !finished_nodes.contains(adj) { - edge_types.insert( - (node.clone(), adj.clone()), - EdgeType::BackEdge, - ); - } else if start_times.get(&node) < start_times.get(adj) { - edge_types.insert( - (node.clone(), adj.clone()), - EdgeType::ForwardEdge, - ); - } else { - edge_types.insert( - (node.clone(), adj.clone()), - EdgeType::CrossEdge, - ); - } + edge_types + .insert((node.clone(), adj.clone()), EdgeType::CrossEdge); } } } @@ -200,4 +190,22 @@ where progress_bar.finish(); return edge_types; } + + // Constructs a Directed Acyclic Graph from the current graph by stripping out the back edges + pub fn dag(&self) -> DirectedAcyclicGraph { + let edge_types = self.compute_edge_types(); + + let mut graph = AdjacencyGraph::new(); + + for ((from, to), edge_type) in edge_types.iter() { + match edge_type { + EdgeType::BackEdge => {} + _ => { + graph.add_edge(from.clone(), to.clone()); + } + } + } + + return DirectedAcyclicGraph(graph); + } } diff --git a/src/graph/mod.rs b/src/graph/mod.rs index c1892c3..0f741ec 100644 --- a/src/graph/mod.rs +++ b/src/graph/mod.rs @@ -5,14 +5,58 @@ use std::{ pub trait Graph where - V: Clone, + V: Ord + Clone, { + fn new() -> Self + where + Self: Sized; + + fn from_edges(edges: &[(V, V)]) -> Self + where + Self: Sized, + { + let mut graph = Self::new(); + + for (from, to) in edges { + graph.add_edge(from.clone(), to.clone()); + } + + graph + } + + fn to_adjecency_graph(&self) -> AdjacencyGraph; + fn nodes(&self) -> BTreeSet; fn adjacencies(&self) -> BTreeMap>; fn edges(&self) -> BTreeSet<(V, V)>; + fn neighbors(&self, from: &V) -> BTreeSet; + + fn add_node(&mut self, node: V); + fn add_edge(&mut self, from: V, to: V); + + fn remove_node(&mut self, node: &V); + fn remove_edge(&mut self, from: &V, to: &V); + + fn restricted(&self, nodes: &Vec) -> Self + where + Self: Sized, + { + let nodes_index = nodes.iter().collect::>(); + let mut restricted = Self::new(); + + for node in nodes { + for adj in self.neighbors(&node) { + if nodes_index.contains(&adj) { + restricted.add_edge(node.clone(), adj.clone()); + } + } + } + + restricted + } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct AdjacencyGraph where V: Clone, @@ -29,8 +73,16 @@ where pub directed: AdjacencyGraph, } +#[derive(Debug)] +pub struct DirectedAcyclicGraph(pub AdjacencyGraph) +where + V: Clone; + pub mod algorithms; +pub mod dag; +pub mod directed; pub mod edge_types; +pub mod undirected; #[cfg(test)] mod tests { @@ -141,4 +193,29 @@ mod tests { g.compact_chains(); println!("{:?}", g); } + + #[test] + fn test_all_paths() { + let g = AdjacencyGraph::from_edges(&[ + // + ("u", "v"), + ("u", "x"), + ("v", "y"), + ("y", "x"), + ("y", "w"), + ("w", "z"), + ("z", "x"), + ("x", "h"), + ("x", "g"), + ("h", "i"), + ("g", "i"), + ]); + + let g = g.dag(); + + g.all_paths(&"u", |path| { + println!("{:?}", path); + false + }); + } } diff --git a/src/graph/undirected.rs b/src/graph/undirected.rs new file mode 100644 index 0000000..1eb3271 --- /dev/null +++ b/src/graph/undirected.rs @@ -0,0 +1,55 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use super::{AdjacencyGraph, Graph, UndirectedGraph}; + +impl Graph for UndirectedGraph +where + V: Ord + Clone, +{ + fn new() -> Self + where + Self: Sized, + { + UndirectedGraph { + directed: AdjacencyGraph::new(), + } + } + + fn to_adjecency_graph(&self) -> AdjacencyGraph { + self.directed.clone() + } + + fn nodes(&self) -> BTreeSet { + self.directed.nodes() + } + + fn adjacencies(&self) -> BTreeMap> { + self.directed.adjacencies() + } + + fn neighbors(&self, from: &V) -> BTreeSet { + self.directed.neighbors(from) + } + + fn edges(&self) -> BTreeSet<(V, V)> { + self.directed.edges() + } + + fn add_node(&mut self, node: V) { + self.directed.add_node(node); + } + + fn add_edge(&mut self, from: V, to: V) { + self.directed.add_edge(from.clone(), to.clone()); + self.directed.add_edge(to, from); + } + + fn remove_node(&mut self, node: &V) { + self.directed.remove_node(node); + } + + fn remove_edge(&mut self, from: &V, to: &V) { + self.directed.remove_edge(from, to); + self.directed.remove_edge(to, from); + } +} diff --git a/src/main.rs b/src/main.rs index 3b88a66..4be651f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ mod gfa; mod graph; +mod rolling_hash; use std::{ collections::{BTreeMap, HashMap}, @@ -12,9 +13,10 @@ use std::{ use argh::FromArgs; use gfa::{Entry, Orientation}; -use graph::{AdjacencyGraph, Graph}; +use graph::{AdjacencyGraph, DirectedAcyclicGraph, Graph}; use indicatif::ProgressIterator; use rand::seq::SliceRandom; +use rolling_hash::RollingHash; #[derive(FromArgs, PartialEq, Debug)] /// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024 @@ -36,21 +38,39 @@ struct CommandShow { #[argh(option, short = 'i')] /// file to read input: String, + + #[argh(option, short = 'c', default = "1")] + /// number of paths to visit + path_count: usize, + + #[argh(option, short = 'p', default = "\"ACGT\".to_string()")] + /// k-mer pattern to search + pattern: String, + + #[argh(option, short = 'k', default = "4")] + /// k-mer length + k: usize, } fn main() -> std::io::Result<()> { let opts = argh::from_env::(); match opts.nested { - CliSubcommands::Show(show) => { - let file_lines_count = BufReader::new(std::fs::File::open(&show.input)?) + CliSubcommands::Show(opts) => { + // validate opts.pattern is a valid DNA sequence + if opts.pattern.chars().any(|c| !"ACGT".contains(c)) { + eprintln!("Invalid pattern: {:?}", opts.pattern); + process::exit(1); + } + + let file_lines_count = BufReader::new(std::fs::File::open(&opts.input)?) .lines() .progress_with( indicatif::ProgressBar::new_spinner().with_message("estimating line count"), ) .count() as u64; - let file = std::fs::File::open(show.input)?; + let file = std::fs::File::open(opts.input)?; let entries = gfa::parser::parse_source(file, file_lines_count)?; @@ -76,24 +96,24 @@ fn main() -> std::io::Result<()> { } } - compute_graph_stats(&graph); + compute_graph_degrees(&graph); - let edge_types = compute_edge_types(&graph); + let dag = graph.dag(); - println!("Removing back edges..."); + // let edge_types = compute_edge_types(&graph); - for ((from, to), edge_type) in edge_types.iter() { - match edge_type { - graph::edge_types::EdgeType::BackEdge => { - graph.remove_edge(from, to); - } - _ => {} - } - } + // for ((from, to), edge_type) in edge_types.iter() { + // match edge_type { + // graph::edge_types::EdgeType::BackEdge => { + // graph.remove_edge(from, to); + // } + // _ => {} + // } + // } - compute_edge_types(&graph); + compute_edge_types(&dag.0); - let ccs = compute_ccs(&graph); + let ccs = compute_ccs(&dag.0); println!("Picking largest connected component..."); // pick the largest connected component @@ -102,25 +122,168 @@ fn main() -> std::io::Result<()> { .max_by_key(|cc| cc.len()) .expect("at least one connected components"); - let largest_cc_graph = graph.restricted(largest_cc); + let largest_cc_graph = dag.restricted(largest_cc); - compute_graph_stats(&largest_cc_graph); - compute_edge_types(&largest_cc_graph); + let degrees = compute_graph_degrees(&largest_cc_graph); + compute_edge_types(&largest_cc_graph); // to double check this is a DAG - // let mut largest_cc_graph = graph.restricted(largest_cc).undirected(); + println!("Searching for a start node..."); + let start_node = degrees + .iter() + .find(|(_, degree)| degree.in_degree == 0) + .expect("no start node found") + .0; + + println!("Start node: {:?}", start_node); + println!("{:?}", degrees.get(start_node).unwrap()); + + compute_orientation_histogram(&largest_cc_graph); + + println!("Visiting the graph, searching {} paths...", opts.path_count); + + let sequences = compute_sequences( + &sequence_map, + &largest_cc_graph, + start_node, + opts.path_count, + ); + + for (i, sequence) in sequences.iter().enumerate() { + println!("Sequence #{} of length {}", i + 1, sequence.len()); + + println!("Searching {} in the sequence (naive)...", opts.pattern); + let occurrences = compute_sequence_occurrences_naive(sequence, &opts.pattern); + println!("Occurrences: {:?}\n", occurrences); + + println!( + "Searching {} in the sequence (rolling hash)...", + opts.pattern + ); + let occurrences = + compute_sequence_occurrences_rolling_hash(sequence, &opts.pattern); + println!("Occurrences: {:?}\n", occurrences); + } - // compute_graph_stats(&largest_cc_graph); - // // compute_edge_types(&largest_cc_graph); + println!("Cleaning up..."); + process::exit(0); + } + } +} - // println!("Compacting chains..."); - // largest_cc_graph.compact_chains(); +fn letter_to_number(letter: char) -> u64 { + match letter { + 'A' => 1, + 'C' => 2, + 'G' => 3, + 'T' => 4, + _ => panic!("Invalid letter: {}", letter), + } +} - // compute_graph_stats(&largest_cc_graph); - // compute_edge_types(&largest_cc_graph.directed); +fn compute_sequence_occurrences_rolling_hash(sequence: &str, pattern: &str) -> Vec { + let chars = sequence.chars().map(letter_to_number).collect::>(); - println!("Cleaning up..."); - process::exit(0); + let mut occurrences = vec![]; + + let mut rl = RollingHash::new(3000, 5); + // let mut rl = RollingHash::new(1_000_000, 5); + + let pattern_hash = rl.hash_pattern(&pattern.chars().map(letter_to_number).collect::>()); + + for i in 0..pattern.len() { + // println!("Adding letter: {}", chars[i]); + rl.add_last(chars[i]); + } + + for i in pattern.len()..sequence.len() { + let hash = rl.hash(); + + if rl.compare(&hash, &pattern_hash) { + println!("Hash match at position {}", i); + + // check for false positives + if &sequence[i - pattern.len()..i] != pattern { + println!("=> False positive"); + } else { + println!("=> Correct"); + occurrences.push(i - pattern.len()); + } + } + + rl.advance(chars[i]); + } + + occurrences +} + +fn compute_sequence_occurrences_naive(sequence: &str, pattern: &str) -> Vec { + let mut occurrences = vec![]; + + for i in 0..sequence.len() - pattern.len() { + if &sequence[i..i + pattern.len()] == pattern { + occurrences.push(i); + } + } + + occurrences +} + +fn compute_sequences( + sequence_map: &HashMap, + graph: &DirectedAcyclicGraph<(String, Orientation)>, + start_node: &(String, Orientation), + count: usize, +) -> Vec { + let mut sequences = vec![]; + + let mut path_counter = 0; + graph.all_paths(start_node, |path| { + println!("Path #{} of length {}", path_counter + 1, path.len()); + + let mut sequence = String::new(); + for node in path { + let (id, orientation) = node; + let seq = sequence_map.get(&id).expect("sequence not found"); + match orientation { + Orientation::Forward => sequence.push_str(seq), + Orientation::Reverse => sequence.push_str( + &seq.chars() + .map(|c| match c { + 'A' => 'T', + 'T' => 'A', + 'C' => 'G', + 'G' => 'C', + _ => panic!("Invalid letter: {}", c), + }) + .rev() + .collect::(), + ), + } } + + sequences.push(sequence); + + path_counter += 1; + path_counter < count + }); + + sequences +} + +fn compute_orientation_histogram(graph: &impl Graph<(String, Orientation)>) { + let orientation_histogram = + graph + .nodes() + .iter() + .map(|node| node.1) + .fold(BTreeMap::new(), |mut acc, orientation| { + *acc.entry(orientation).or_insert(0) += 1; + acc + }); + + println!("Orientation histogram:"); + for (orientation, count) in orientation_histogram.iter() { + println!("- {:?}: {}", orientation, count); } } @@ -148,12 +311,12 @@ where ccs } -fn compute_edge_types(graph: &AdjacencyGraph) -> BTreeMap<(V, V), graph::edge_types::EdgeType> +fn compute_edge_types(graph: &impl Graph) -> BTreeMap<(V, V), graph::edge_types::EdgeType> where V: Ord + Eq + Clone + Debug, { println!("Computing edge types..."); - let edge_types = graph.compute_edge_types(); + let edge_types = graph.to_adjecency_graph().compute_edge_types(); println!("Computing edge types histogram..."); let histogram = edge_types.iter().map(|(_, edge_type)| edge_type).fold( @@ -167,7 +330,7 @@ where println!("Node count: {}", graph.nodes().len()); println!( "Edge count: {}, Total edge count: {}", - graph.edges().count(), + graph.edges().len(), edge_types.len() ); println!("Edge types histogram (type/count):"); @@ -186,7 +349,7 @@ where let mut g2 = AdjacencyGraph::new(); - let mut shuffled_nodes: Vec<_> = graph.nodes().iter().collect::>(); + let mut shuffled_nodes: Vec<_> = graph.nodes().into_iter().collect::>(); shuffled_nodes.shuffle(&mut rand::thread_rng()); for node in shuffled_nodes.iter() { @@ -210,9 +373,16 @@ where g2 } +#[derive(Debug)] +struct NodeDegree { + degree: usize, + in_degree: usize, + out_degree: usize, +} + /// This function prints the number of nodes, edges and a histogram of the degrees of the nodes /// in the graph (computing the degrees might take a long time) -fn compute_graph_stats(graph: &impl Graph) +fn compute_graph_degrees(graph: &impl Graph) -> BTreeMap where V: Ord + Eq + Clone + Debug, { @@ -226,6 +396,12 @@ where let progress_bar = indicatif::ProgressBar::new(graph.edges().len() as u64); + for node in graph.nodes() { + vertices_degrees.insert(node.clone(), 0); + vertices_in_degrees.insert(node.clone(), 0); + vertices_out_degrees.insert(node.clone(), 0); + } + for (from, tos) in graph.adjacencies() { *vertices_degrees.entry(from.clone()).or_insert(0) += tos.len(); *vertices_out_degrees.entry(from.clone()).or_insert(0) += tos.len(); @@ -248,6 +424,20 @@ where *acc.entry(degree).or_insert(0) += 1; acc }); + let histogram_in: BTreeMap = vertices_in_degrees + .iter() + .map(|(_, degree)| *degree) + .fold(BTreeMap::new(), |mut acc, degree| { + *acc.entry(degree).or_insert(0) += 1; + acc + }); + let histogram_out: BTreeMap = vertices_out_degrees + .iter() + .map(|(_, degree)| *degree) + .fold(BTreeMap::new(), |mut acc, degree| { + *acc.entry(degree).or_insert(0) += 1; + acc + }); println!("Stats:"); println!("- Nodes: {}", graph.nodes().len()); @@ -257,4 +447,49 @@ where for (degree, count) in histogram.iter() { println!("- {}: {}", degree, count); } + + println!("In-degrees histogram (degree/count):"); + for (degree, count) in histogram_in.iter() { + println!("- {}: {}", degree, count); + } + + println!("Out-degrees histogram (degree/count):"); + for (degree, count) in histogram_out.iter() { + println!("- {}: {}", degree, count); + } + + let mut node_degrees = BTreeMap::new(); + + for node in graph.nodes() { + node_degrees.insert( + node.clone(), + NodeDegree { + degree: *vertices_degrees.get(&node).expect("node already computed"), + in_degree: *vertices_in_degrees + .get(&node) + .expect("node already computed"), + out_degree: *vertices_out_degrees + .get(&node) + .expect("node already computed"), + }, + ); + } + + node_degrees } + +// fn compute_compact_graph(graph: &mut UndirectedGraph) -> UndirectedGraph +// where +// V: Ord + Eq + Clone + Debug, +// { +// compute_graph_degrees(graph); +// // compute_edge_types(graph); + +// println!("Compacting chains..."); +// graph.compact_chains(); + +// compute_graph_degrees(graph); +// compute_edge_types(graph.directed); + +// graph.clone() +// } diff --git a/src/rolling_hash/mod.rs b/src/rolling_hash/mod.rs new file mode 100644 index 0000000..fc7324d --- /dev/null +++ b/src/rolling_hash/mod.rs @@ -0,0 +1,238 @@ +use std::{collections::VecDeque, fmt::Debug}; + +pub struct RollingHash + Clone> { + modulus: u64, + alphabet_size: u64, + + offset: u64, + current_word: VecDeque, + hash: u64, +} + +#[derive(Debug)] +pub struct Hashed { + hash: u64, + offset: u64, +} + +fn wrapping_pow_correct(a: u64, b: u64) -> u64 { + // // println!("Wrapping pow: {}^{}", a, b); + + // let mut result = 1u64; + // for _ in 0..b { + // result = result.wrapping_mul(a); + // } + + // // println!("=> {}", result); + + // result + + a.wrapping_pow(b as u32) +} + +impl RollingHash +where + T: Into + Clone + Debug, +{ + pub fn new(modulus: u64, alphabet_size: u64) -> Self { + RollingHash { + modulus, + alphabet_size, + + offset: 0, + current_word: VecDeque::new(), + hash: 0, + } + } + + // pub fn hash(&self) -> u64 { + // self.hash % self.modulus + // } + + pub fn hash(&self) -> Hashed { + Hashed { + hash: self.hash % self.modulus, + offset: self.offset, + } + } + + pub fn compare(&self, lhs: &Hashed, rhs: &Hashed) -> bool { + // println!("Comparing: {:?} {:?}", lhs, rhs); + + let (lhs, rhs) = if lhs.offset < rhs.offset { + (lhs, rhs) + } else { + (rhs, lhs) + }; + + // Shift lhs to the right by the difference in offsets + let shifted_lhs = (lhs.hash + * wrapping_pow_correct(self.alphabet_size, rhs.offset - lhs.offset)) + % self.modulus; + + shifted_lhs == rhs.hash + } + + pub fn hash_pattern(&self, pattern: &[T]) -> Hashed { + let mut hash = 0; + + for (i, value) in pattern.iter().enumerate() { + let char_hash = + value.clone().into() * wrapping_pow_correct(self.alphabet_size, i as u64); + + hash += char_hash; + } + + Hashed { hash, offset: 0 } + } + + pub fn add_last(&mut self, value: T) { + self.current_word.push_back(value.clone()); + + let i = self.offset + (self.current_word.len() as u64) - 1; + + // println!("Alphabet size: {}", self.alphabet_size); + // println!("Index: {}", i); + + // println!( + // "Adding: {:?} * {} to {}", + // value, + // wrapping_pow_correct(self.alphabet_size, i), + // self.hash + // ); + self.hash += value.into() * wrapping_pow_correct(self.alphabet_size, i); + } + + pub fn remove_first(&mut self) { + let value = self.current_word.pop_front().unwrap(); + + let i = self.offset; + + self.hash -= value.into() * wrapping_pow_correct(self.alphabet_size, i); + + self.offset += 1; + } + + pub fn advance(&mut self, value: T) { + self.add_last(value); + self.remove_first(); + } + + pub fn hash_value_at(&self, h: &Hashed, pos: u64) -> u64 { + let offset = h.offset; + let hash = h.hash; + + let diff = pos as i64 - offset as i64; + + if diff < 0 { + panic!("Invalid position"); + } + + (hash * wrapping_pow_correct(self.alphabet_size, diff as u64)) % self.modulus + } + + pub fn hash_value_at_caret(&self, h: &Hashed) -> u64 { + self.hash_value_at(h, self.offset) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rolling_hash() { + println!("Rolling hash test"); + + let modulus = 42; + let alphabet_size = 4; + + let mut rh = RollingHash::::new(modulus, alphabet_size); + let initial_pattern_hash = rh.hash_pattern(&[1, 2, 3, 4, 5]); + println!("Initial pattern hash: {:?}", initial_pattern_hash); + + rh.add_last(1); + rh.add_last(2); + rh.add_last(3); + rh.add_last(4); + rh.add_last(5); + + println!("Hash: {:?}", rh.hash()); + + rh.advance(0); + + println!("Hash: {:?}", rh.hash()); + + rh.advance(1); + rh.advance(2); + rh.advance(3); + rh.advance(4); + rh.advance(5); + + println!("Current word: {:?}", rh.current_word); + println!("Hash: {:?}", rh.hash()); + println!( + "Shifted pattern hash: {}", + rh.hash_value_at_caret(&initial_pattern_hash) + ); + + let pattern = initial_pattern_hash; + let curr_hash = rh.hash(); + + println!("Pattern hash: {:?}", pattern); + println!("Current hash: {:?}", curr_hash); + + println!( + "Pattern hash at caret: {}", + rh.hash_value_at_caret(&pattern) + ); + + println!("Compare: {:?}", rh.compare(&pattern, &curr_hash)); + } + + #[test] + fn test_geometry_rolling_hash() { + println!("Geometry rolling hash test"); + + let modulus = 10_000_000; + let alphabet_size = 2; + + let mut rh = RollingHash::::new(modulus, alphabet_size); + + let initial_pattern_hash = rh.hash_pattern(&[1, 1, 1, 1]); + + rh.add_last(1); + rh.add_last(1); + rh.add_last(1); + rh.add_last(1); + + println!("Initial pattern hash: {:?}", initial_pattern_hash); + println!("Hash: {:?}", rh.hash()); + + rh.advance(1); + + println!("Hash: {:?}", rh.hash()); + println!( + "Shifted pattern hash: {:?}", + rh.hash_value_at(&initial_pattern_hash, rh.offset) + ); + } + + #[test] + fn test_wrappping_pow() { + println!("Wrapping pow test"); + + let a = 2; + let b = 3; + + let result = wrapping_pow_correct(a, b); + + assert_eq!(result, 8); + + let a = 3; + let b = 100; + let result = wrapping_pow_correct(a, b); + + assert_ne!(result, 0); + } +}