mirror of https://github.com/aziis98/asd-2024.git
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
189 lines
4.5 KiB
Rust
189 lines
4.5 KiB
Rust
#![allow(dead_code)]
|
|
|
|
use std::{
|
|
io::{self, BufRead, BufReader, Read},
|
|
str::FromStr,
|
|
};
|
|
|
|
use indicatif::ProgressIterator;
|
|
|
|
use crate::gfa::{Entry, Orientation};
|
|
|
|
fn parse_orientation(s: &str) -> Orientation {
|
|
match s {
|
|
"+" => Orientation::Forward,
|
|
">" => Orientation::Forward,
|
|
"-" => Orientation::Reverse,
|
|
"<" => Orientation::Reverse,
|
|
_ => panic!("Invalid orientation: {}", s),
|
|
}
|
|
}
|
|
|
|
/// Parse a line of the source file into a Header struct
|
|
///
|
|
/// ```txt
|
|
/// H VN:Z:1.0
|
|
/// ```
|
|
///
|
|
fn parse_header(line: &str) -> Entry {
|
|
let columns: Vec<&str> = line.split(':').collect();
|
|
|
|
Entry::Header {
|
|
version: columns[2].to_string(),
|
|
}
|
|
}
|
|
|
|
/// Parse a line of the source file into a Segment struct
|
|
///
|
|
/// ```txt
|
|
/// S 1 ACGT
|
|
/// ```
|
|
fn parse_segment(line: &str) -> Entry {
|
|
let columns: Vec<&str> = line.split('\t').collect();
|
|
|
|
Entry::Segment {
|
|
id: columns[1].to_string(),
|
|
sequence: columns[2].to_string(),
|
|
}
|
|
}
|
|
|
|
/// Parse a line of the source file into a Link struct
|
|
///
|
|
/// ```txt
|
|
/// L 1 + 2 - 3M
|
|
/// ```
|
|
fn parse_link(line: &str) -> Entry {
|
|
let columns: Vec<&str> = line.split('\t').collect();
|
|
|
|
Entry::Link {
|
|
from: columns[1].to_string(),
|
|
from_orient: parse_orientation(columns[2]),
|
|
to: columns[3].to_string(),
|
|
to_orient: parse_orientation(columns[4]),
|
|
}
|
|
}
|
|
|
|
/// Parse a line of the source file into a Path struct
|
|
///
|
|
/// ```txt
|
|
/// P A 11+,12+,14+,15-,17+ *,*,*,*
|
|
/// ```
|
|
fn parse_path(line: &str) -> Entry {
|
|
let columns: Vec<&str> = line.split('\t').collect();
|
|
|
|
Entry::Path {
|
|
name: columns[1].to_string(),
|
|
segments: columns[2]
|
|
.split(',')
|
|
.map(|s| {
|
|
let (name, orient) = s.split_at(s.len() - 1);
|
|
(name.to_string(), parse_orientation(orient))
|
|
})
|
|
.collect(),
|
|
}
|
|
}
|
|
|
|
fn parse_path_segments(s: &str) -> Vec<(String, Orientation)> {
|
|
let mut result = Vec::new();
|
|
let mut rest = s;
|
|
|
|
loop {
|
|
// println!("Rest: {}", rest);
|
|
|
|
let r = rest;
|
|
|
|
let (orient, r) = r.split_at(1);
|
|
let (name, r) = r.split_at(r.find(['<', '>']).unwrap_or(r.len()));
|
|
|
|
rest = r;
|
|
result.push((name.to_string(), parse_orientation(orient)));
|
|
|
|
if rest.is_empty() {
|
|
break;
|
|
}
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Parse a line of the source file into a Walk struct
|
|
///
|
|
/// ```txt
|
|
/// W sample 1 A 0 5 >11>12>14>15>17
|
|
/// ```
|
|
fn parse_walk(line: &str) -> Entry {
|
|
let columns: Vec<&str> = line.split('\t').collect();
|
|
|
|
Entry::Walk {
|
|
sample: columns[1].to_string(),
|
|
|
|
haplotype_index: usize::from_str(columns[2]).unwrap(),
|
|
seq_id: columns[3].to_string(),
|
|
seq_start: usize::from_str(columns[4]).unwrap(),
|
|
seq_end: usize::from_str(columns[5]).unwrap(),
|
|
|
|
segments: parse_path_segments(columns[6]),
|
|
}
|
|
}
|
|
|
|
pub fn parse_file(file: &str) -> io::Result<Vec<Entry>> {
|
|
let file_lines_count = BufReader::new(std::fs::File::open(file)?)
|
|
.lines()
|
|
.progress_count(0)
|
|
.count() as u64;
|
|
|
|
let file = std::fs::File::open(file)?;
|
|
|
|
parse_source(file, file_lines_count)
|
|
}
|
|
|
|
pub fn parse_source<R: Read>(reader: R, line_count: u64) -> io::Result<Vec<Entry>> {
|
|
let mut entries = Vec::new();
|
|
let mut skipped = Vec::new();
|
|
|
|
println!("Parsing GFA file...");
|
|
|
|
for line in BufReader::new(reader).lines().progress_count(line_count) {
|
|
let line = line?;
|
|
let line = line.trim();
|
|
|
|
if line.is_empty() || line.starts_with('#') {
|
|
continue;
|
|
}
|
|
|
|
let first_char = line.chars().next().unwrap();
|
|
let entry = match first_char {
|
|
'H' => parse_header(line),
|
|
'S' => parse_segment(line),
|
|
'L' => parse_link(line),
|
|
// 'P' => parse_path(line),
|
|
// 'W' => parse_walk(line),
|
|
_ => {
|
|
skipped.push(line.chars().next().expect("got empty line"));
|
|
continue;
|
|
}
|
|
};
|
|
|
|
entries.push(entry);
|
|
}
|
|
|
|
// Print skipped lines by compacting same ones together
|
|
for (s, count) in skipped.iter().fold(Vec::new(), |mut acc, s| {
|
|
if let Some((last, count)) = acc.last_mut() {
|
|
if *last == s {
|
|
*count += 1;
|
|
} else {
|
|
acc.push((s, 1));
|
|
}
|
|
} else {
|
|
acc.push((s, 1));
|
|
}
|
|
|
|
acc
|
|
}) {
|
|
eprintln!("Skipped {} lines of type: {}", count, s);
|
|
}
|
|
|
|
Ok(entries)
|
|
}
|