From 607d2fc14b977be41bd241247030cd78198833cb Mon Sep 17 00:00:00 2001 From: Antonio De Lucreziis Date: Wed, 22 May 2024 19:39:39 +0200 Subject: [PATCH] connected components algorithm --- src/gfa.rs | 36 ++++++++++ src/graph.rs | 185 +++++++++++++++++++++++++++++++++++++++++++++++--- src/main.rs | 51 +++++++++++++- src/parser.rs | 79 +++++++-------------- 4 files changed, 282 insertions(+), 69 deletions(-) create mode 100644 src/gfa.rs diff --git a/src/gfa.rs b/src/gfa.rs new file mode 100644 index 0000000..490f8e6 --- /dev/null +++ b/src/gfa.rs @@ -0,0 +1,36 @@ +#[derive(Debug)] +pub enum Orientation { + Forward, + Reverse, +} + +#[derive(Debug)] +pub enum Entry { + Header { + version: String, + }, + Segment { + id: String, + sequence: String, + }, + Link { + from: String, + from_orient: Orientation, + to: String, + to_orient: Orientation, + }, + Path { + name: String, + segments: Vec<(String, Orientation)>, + }, + Walk { + sample: String, + + haplotype_index: usize, + seq_id: String, + seq_start: usize, + seq_end: usize, + + segments: Vec<(String, Orientation)>, + }, +} diff --git a/src/graph.rs b/src/graph.rs index 0a3a3c6..8d9f0ee 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -1,26 +1,189 @@ -struct AdjacencyGraph { - nodes: Vec<&str>, - adjacencies: HashMap<&str, Vec<&str>>, +use std::{ + cell::RefCell, + collections::{HashMap, HashSet}, + rc::Rc, + result, +}; + +#[derive(Debug)] +pub struct AdjacencyGraph { + nodes: HashMap, + adjacencies: HashMap>, } impl AdjacencyGraph { - fn new() -> Self { + pub fn new() -> Self { AdjacencyGraph { - nodes: Vec::new(), + nodes: HashMap::new(), adjacencies: HashMap::new(), } } - fn add_node(&mut self, node: &str) { - self.nodes.push(node); - self.adjacencies.insert(node, Vec::new()); + pub fn add_node(&mut self, key: String, value: String) { + self.nodes.insert(key, value); } - fn add_edge(&mut self, from: &str, to: &str) { - self.adjacencies.get_mut(from).unwrap().push(to); + pub fn add_edge(&mut self, from: String, to: String) { + self.adjacencies + .entry(from) + .or_insert(HashSet::new()) + .insert(to); } - fn neighbors(&self, node: &str) -> Option<&Vec<&str>> { + pub fn get_adjacencies(&self, node: &str) -> Option<&HashSet> { self.adjacencies.get(node) } + + pub fn adjacencies(&self) -> &HashMap> { + &self.adjacencies + } + + pub fn opposite(&self) -> AdjacencyGraph { + let mut opposite = AdjacencyGraph::new(); + + for (from, adjacencies) in self.adjacencies.iter() { + for to in adjacencies { + opposite.add_edge(to.clone(), from.clone()); + } + } + + opposite + } + + pub fn has_edge(&self, from: &str, to: &str) -> bool { + if let Some(adjacencies) = self.get_adjacencies(from) { + adjacencies.contains(&to.to_string()) + } else { + false + } + } + + // pub fn dfs(&self, node: String) -> HashSet { + // let mut visited = HashSet::new(); + // let mut stack = vec![node]; + + // while let Some(node) = stack.pop() { + // if visited.contains(&node) { + // continue; + // } + + // visited.insert(node.clone()); + + // if let Some(adjacencies) = self.get_adjacencies(&node) { + // for adj in adjacencies { + // stack.push(adj.clone()); + // } + // } + // } + + // visited + // } + + pub fn compute_ccs(&self) -> Vec> { + let mut visited = HashSet::new(); + let mut result = Vec::new(); + + let op = self.opposite(); + + for node in self.nodes.keys() { + if visited.contains(node) { + continue; + } + + let mut cc = HashSet::new(); + let mut stack = vec![node.to_string()]; + + while let Some(node) = stack.pop() { + if cc.contains(&node) { + continue; + } + + cc.insert(node.clone()); + + if let Some(adjacencies) = self.get_adjacencies(&node) { + for adj in adjacencies { + stack.push(adj.clone()); + } + } + + if let Some(adjacencies) = op.get_adjacencies(&node) { + for adj in adjacencies { + stack.push(adj.clone()); + } + } + } + + visited.extend(cc.iter().map(|x| x.to_owned())); + result.push(cc.iter().map(|x| x.to_owned()).collect()); + } + + result + } + + pub fn compute_ccs_2(&self) -> Vec> { + let mut cc = HashMap::>>>::new(); + + for node in self.nodes.keys() { + if cc.contains_key(node) { + continue; + } + + println!("All CC: {:?}", cc); + + let new_cc = Rc::new(RefCell::new(HashSet::new())); + + let mut stack = vec![node.to_string()]; + + while let Some(node) = stack.pop() { + println!("New CC: {:?}", new_cc.borrow()); + + new_cc.borrow_mut().insert(node.clone()); + + if cc.contains_key(&node) { + // merge the two connected components and go to the next node + + let old_cc = cc.get(&node).unwrap(); + + println!("Merging {:?} with {:?}", new_cc.borrow(), old_cc.borrow()); + + new_cc + .borrow_mut() + .extend(old_cc.borrow().iter().map(|x| x.to_owned())); + + break; + } + + if new_cc.borrow().contains(&node) { + continue; + } + + if let Some(adjacencies) = self.get_adjacencies(&node) { + for adj in adjacencies { + stack.push(adj.clone()); + } + } + } + + for n in new_cc.borrow().iter() { + cc.insert(n.to_owned(), new_cc.clone()); + } + } + + // extract the unique connected components by pointers + let mut result = Vec::new(); + let mut seen = HashSet::new(); + + for node in self.nodes.keys() { + if seen.contains(node) { + continue; + } + + let cc = cc.get(node).unwrap(); + seen.extend(cc.borrow().iter().map(|x| x.to_owned())); + + result.push(cc.borrow().iter().map(|x| x.to_owned()).collect()); + } + + result + } } diff --git a/src/main.rs b/src/main.rs index e12ec41..9d560a8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,9 @@ use argh::FromArgs; +use gfa::{Entry, Orientation}; +use graph::AdjacencyGraph; +mod gfa; +mod graph; mod parser; #[derive(FromArgs, PartialEq, Debug)] @@ -24,17 +28,58 @@ struct CommandShow { input: String, } -fn main() { +fn main() -> std::io::Result<()> { let opts = argh::from_env::(); match opts.nested { MySubCommandEnum::Show(show) => { - let file = std::fs::read_to_string(show.input).expect("cannot read file"); - let entries = parser::parse_source(file.as_str()); + let file = std::fs::File::open(show.input)?; + let entries = parser::parse_source(file)?; + + let mut graph = AdjacencyGraph::new(); for entry in entries { println!("{:?}", entry); + + match entry { + Entry::Segment { id, sequence } => { + graph.add_node(id, sequence); + } + Entry::Link { + from, + from_orient, + to, + to_orient, + } => match (from_orient, to_orient) { + (Orientation::Forward, Orientation::Forward) + | (Orientation::Reverse, Orientation::Reverse) => { + graph.add_edge(from, to); + } + (Orientation::Forward, Orientation::Reverse) + | (Orientation::Reverse, Orientation::Forward) => { + graph.add_edge(to, from); + } + }, + _ => {} + } + } + + for (from, adjacencies) in graph.adjacencies().iter() { + println!( + "{} -> {}", + from, + adjacencies + .iter() + .map(|to| to.to_owned()) + .collect::>() + .join(", ") + ); } + + let cc = graph.compute_ccs(); + println!("Number of connected components: {}", cc.len()); } } + + Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index ba2d9ae..3d4ae54 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,54 +1,20 @@ -use std::str::FromStr; - -#[derive(Debug)] -pub enum Orientation { - Forward, - Reverse, -} - -impl From<&str> for Orientation { - fn from(s: &str) -> Self { - match s { - "+" => Orientation::Forward, - ">" => Orientation::Forward, - "-" => Orientation::Reverse, - "<" => Orientation::Reverse, - _ => panic!("Invalid orientation: {}", s), - } +use std::{ + io::{self, BufRead, BufReader, Read}, + str::FromStr, +}; + +use crate::gfa::{Entry, Orientation}; + +fn parse_orientation(s: &str) -> Orientation { + match s { + "+" => Orientation::Forward, + ">" => Orientation::Forward, + "-" => Orientation::Reverse, + "<" => Orientation::Reverse, + _ => panic!("Invalid orientation: {}", s), } } -#[derive(Debug)] -pub enum Entry { - Header { - version: String, - }, - Segment { - id: String, - sequence: String, - }, - Link { - from: String, - from_orient: Orientation, - to: String, - to_orient: Orientation, - }, - Path { - name: String, - segments: Vec<(String, Orientation)>, - }, - Walk { - sample: String, - - haplotype_index: usize, - seq_id: String, - seq_start: usize, - seq_end: usize, - - segments: Vec<(String, Orientation)>, - }, -} - /// Parse a line of the source file into a Header struct /// /// ```txt @@ -87,9 +53,9 @@ fn parse_link(line: &str) -> Entry { Entry::Link { from: columns[1].to_string(), - from_orient: columns[2].into(), + from_orient: parse_orientation(columns[2]), to: columns[3].to_string(), - to_orient: columns[4].into(), + to_orient: parse_orientation(columns[4]), } } @@ -107,7 +73,7 @@ fn parse_path(line: &str) -> Entry { .split(',') .map(|s| { let (name, orient) = s.split_at(s.len() - 1); - (name.to_string(), orient.into()) + (name.to_string(), parse_orientation(orient)) }) .collect(), } @@ -126,7 +92,7 @@ fn parse_path_segments(s: &str) -> Vec<(String, Orientation)> { let (name, r) = r.split_at(r.find(['<', '>']).unwrap_or(r.len())); rest = r; - result.push((name.to_string(), orient.into())); + result.push((name.to_string(), parse_orientation(orient))); if rest.is_empty() { break; @@ -156,10 +122,12 @@ fn parse_walk(line: &str) -> Entry { } } -pub fn parse_source(source: &str) -> Vec { +pub fn parse_source(reader: R) -> io::Result> { let mut entries = Vec::new(); - for line in source.lines() { + for line in BufReader::new(reader).lines() { + let line = line?; let line = line.trim(); + if line.is_empty() || line.starts_with('#') { continue; } @@ -180,5 +148,6 @@ pub fn parse_source(source: &str) -> Vec { }; entries.push(entry); } - entries + + Ok(entries) }