connected components algorithm

main
Antonio De Lucreziis 6 months ago
parent 425fcd8544
commit 607d2fc14b

@ -0,0 +1,36 @@
#[derive(Debug)]
pub enum Orientation {
Forward,
Reverse,
}
#[derive(Debug)]
pub enum Entry {
Header {
version: String,
},
Segment {
id: String,
sequence: String,
},
Link {
from: String,
from_orient: Orientation,
to: String,
to_orient: Orientation,
},
Path {
name: String,
segments: Vec<(String, Orientation)>,
},
Walk {
sample: String,
haplotype_index: usize,
seq_id: String,
seq_start: usize,
seq_end: usize,
segments: Vec<(String, Orientation)>,
},
}

@ -1,26 +1,189 @@
struct AdjacencyGraph { use std::{
nodes: Vec<&str>, cell::RefCell,
adjacencies: HashMap<&str, Vec<&str>>, collections::{HashMap, HashSet},
rc::Rc,
result,
};
#[derive(Debug)]
pub struct AdjacencyGraph {
nodes: HashMap<String, String>,
adjacencies: HashMap<String, HashSet<String>>,
} }
impl AdjacencyGraph { impl AdjacencyGraph {
fn new() -> Self { pub fn new() -> Self {
AdjacencyGraph { AdjacencyGraph {
nodes: Vec::new(), nodes: HashMap::new(),
adjacencies: HashMap::new(), adjacencies: HashMap::new(),
} }
} }
fn add_node(&mut self, node: &str) { pub fn add_node(&mut self, key: String, value: String) {
self.nodes.push(node); self.nodes.insert(key, value);
self.adjacencies.insert(node, Vec::new());
} }
fn add_edge(&mut self, from: &str, to: &str) { pub fn add_edge(&mut self, from: String, to: String) {
self.adjacencies.get_mut(from).unwrap().push(to); self.adjacencies
.entry(from)
.or_insert(HashSet::new())
.insert(to);
} }
fn neighbors(&self, node: &str) -> Option<&Vec<&str>> { pub fn get_adjacencies(&self, node: &str) -> Option<&HashSet<String>> {
self.adjacencies.get(node) self.adjacencies.get(node)
} }
pub fn adjacencies(&self) -> &HashMap<String, HashSet<String>> {
&self.adjacencies
}
pub fn opposite(&self) -> AdjacencyGraph {
let mut opposite = AdjacencyGraph::new();
for (from, adjacencies) in self.adjacencies.iter() {
for to in adjacencies {
opposite.add_edge(to.clone(), from.clone());
}
}
opposite
}
pub fn has_edge(&self, from: &str, to: &str) -> bool {
if let Some(adjacencies) = self.get_adjacencies(from) {
adjacencies.contains(&to.to_string())
} else {
false
}
}
// pub fn dfs(&self, node: String) -> HashSet<String> {
// let mut visited = HashSet::new();
// let mut stack = vec![node];
// while let Some(node) = stack.pop() {
// if visited.contains(&node) {
// continue;
// }
// visited.insert(node.clone());
// if let Some(adjacencies) = self.get_adjacencies(&node) {
// for adj in adjacencies {
// stack.push(adj.clone());
// }
// }
// }
// visited
// }
pub fn compute_ccs(&self) -> Vec<Vec<String>> {
let mut visited = HashSet::new();
let mut result = Vec::new();
let op = self.opposite();
for node in self.nodes.keys() {
if visited.contains(node) {
continue;
}
let mut cc = HashSet::new();
let mut stack = vec![node.to_string()];
while let Some(node) = stack.pop() {
if cc.contains(&node) {
continue;
}
cc.insert(node.clone());
if let Some(adjacencies) = self.get_adjacencies(&node) {
for adj in adjacencies {
stack.push(adj.clone());
}
}
if let Some(adjacencies) = op.get_adjacencies(&node) {
for adj in adjacencies {
stack.push(adj.clone());
}
}
}
visited.extend(cc.iter().map(|x| x.to_owned()));
result.push(cc.iter().map(|x| x.to_owned()).collect());
}
result
}
pub fn compute_ccs_2(&self) -> Vec<Vec<String>> {
let mut cc = HashMap::<String, Rc<RefCell<HashSet<String>>>>::new();
for node in self.nodes.keys() {
if cc.contains_key(node) {
continue;
}
println!("All CC: {:?}", cc);
let new_cc = Rc::new(RefCell::new(HashSet::new()));
let mut stack = vec![node.to_string()];
while let Some(node) = stack.pop() {
println!("New CC: {:?}", new_cc.borrow());
new_cc.borrow_mut().insert(node.clone());
if cc.contains_key(&node) {
// merge the two connected components and go to the next node
let old_cc = cc.get(&node).unwrap();
println!("Merging {:?} with {:?}", new_cc.borrow(), old_cc.borrow());
new_cc
.borrow_mut()
.extend(old_cc.borrow().iter().map(|x| x.to_owned()));
break;
}
if new_cc.borrow().contains(&node) {
continue;
}
if let Some(adjacencies) = self.get_adjacencies(&node) {
for adj in adjacencies {
stack.push(adj.clone());
}
}
}
for n in new_cc.borrow().iter() {
cc.insert(n.to_owned(), new_cc.clone());
}
}
// extract the unique connected components by pointers
let mut result = Vec::new();
let mut seen = HashSet::new();
for node in self.nodes.keys() {
if seen.contains(node) {
continue;
}
let cc = cc.get(node).unwrap();
seen.extend(cc.borrow().iter().map(|x| x.to_owned()));
result.push(cc.borrow().iter().map(|x| x.to_owned()).collect());
}
result
}
} }

@ -1,5 +1,9 @@
use argh::FromArgs; use argh::FromArgs;
use gfa::{Entry, Orientation};
use graph::AdjacencyGraph;
mod gfa;
mod graph;
mod parser; mod parser;
#[derive(FromArgs, PartialEq, Debug)] #[derive(FromArgs, PartialEq, Debug)]
@ -24,17 +28,58 @@ struct CommandShow {
input: String, input: String,
} }
fn main() { fn main() -> std::io::Result<()> {
let opts = argh::from_env::<CliTool>(); let opts = argh::from_env::<CliTool>();
match opts.nested { match opts.nested {
MySubCommandEnum::Show(show) => { MySubCommandEnum::Show(show) => {
let file = std::fs::read_to_string(show.input).expect("cannot read file"); let file = std::fs::File::open(show.input)?;
let entries = parser::parse_source(file.as_str()); let entries = parser::parse_source(file)?;
let mut graph = AdjacencyGraph::new();
for entry in entries { for entry in entries {
println!("{:?}", entry); println!("{:?}", entry);
match entry {
Entry::Segment { id, sequence } => {
graph.add_node(id, sequence);
}
Entry::Link {
from,
from_orient,
to,
to_orient,
} => match (from_orient, to_orient) {
(Orientation::Forward, Orientation::Forward)
| (Orientation::Reverse, Orientation::Reverse) => {
graph.add_edge(from, to);
}
(Orientation::Forward, Orientation::Reverse)
| (Orientation::Reverse, Orientation::Forward) => {
graph.add_edge(to, from);
}
},
_ => {}
} }
} }
for (from, adjacencies) in graph.adjacencies().iter() {
println!(
"{} -> {}",
from,
adjacencies
.iter()
.map(|to| to.to_owned())
.collect::<Vec<String>>()
.join(", ")
);
}
let cc = graph.compute_ccs();
println!("Number of connected components: {}", cc.len());
} }
}
Ok(())
} }

@ -1,13 +1,11 @@
use std::str::FromStr; use std::{
io::{self, BufRead, BufReader, Read},
str::FromStr,
};
#[derive(Debug)] use crate::gfa::{Entry, Orientation};
pub enum Orientation {
Forward,
Reverse,
}
impl From<&str> for Orientation { fn parse_orientation(s: &str) -> Orientation {
fn from(s: &str) -> Self {
match s { match s {
"+" => Orientation::Forward, "+" => Orientation::Forward,
">" => Orientation::Forward, ">" => Orientation::Forward,
@ -15,38 +13,6 @@ impl From<&str> for Orientation {
"<" => Orientation::Reverse, "<" => Orientation::Reverse,
_ => panic!("Invalid orientation: {}", s), _ => panic!("Invalid orientation: {}", s),
} }
}
}
#[derive(Debug)]
pub enum Entry {
Header {
version: String,
},
Segment {
id: String,
sequence: String,
},
Link {
from: String,
from_orient: Orientation,
to: String,
to_orient: Orientation,
},
Path {
name: String,
segments: Vec<(String, Orientation)>,
},
Walk {
sample: String,
haplotype_index: usize,
seq_id: String,
seq_start: usize,
seq_end: usize,
segments: Vec<(String, Orientation)>,
},
} }
/// Parse a line of the source file into a Header struct /// Parse a line of the source file into a Header struct
@ -87,9 +53,9 @@ fn parse_link(line: &str) -> Entry {
Entry::Link { Entry::Link {
from: columns[1].to_string(), from: columns[1].to_string(),
from_orient: columns[2].into(), from_orient: parse_orientation(columns[2]),
to: columns[3].to_string(), to: columns[3].to_string(),
to_orient: columns[4].into(), to_orient: parse_orientation(columns[4]),
} }
} }
@ -107,7 +73,7 @@ fn parse_path(line: &str) -> Entry {
.split(',') .split(',')
.map(|s| { .map(|s| {
let (name, orient) = s.split_at(s.len() - 1); let (name, orient) = s.split_at(s.len() - 1);
(name.to_string(), orient.into()) (name.to_string(), parse_orientation(orient))
}) })
.collect(), .collect(),
} }
@ -126,7 +92,7 @@ fn parse_path_segments(s: &str) -> Vec<(String, Orientation)> {
let (name, r) = r.split_at(r.find(['<', '>']).unwrap_or(r.len())); let (name, r) = r.split_at(r.find(['<', '>']).unwrap_or(r.len()));
rest = r; rest = r;
result.push((name.to_string(), orient.into())); result.push((name.to_string(), parse_orientation(orient)));
if rest.is_empty() { if rest.is_empty() {
break; break;
@ -156,10 +122,12 @@ fn parse_walk(line: &str) -> Entry {
} }
} }
pub fn parse_source(source: &str) -> Vec<Entry> { pub fn parse_source<R: Read>(reader: R) -> io::Result<Vec<Entry>> {
let mut entries = Vec::new(); let mut entries = Vec::new();
for line in source.lines() { for line in BufReader::new(reader).lines() {
let line = line?;
let line = line.trim(); let line = line.trim();
if line.is_empty() || line.starts_with('#') { if line.is_empty() || line.starts_with('#') {
continue; continue;
} }
@ -180,5 +148,6 @@ pub fn parse_source(source: &str) -> Vec<Entry> {
}; };
entries.push(entry); entries.push(entry);
} }
entries
Ok(entries)
} }

Loading…
Cancel
Save