From c897945e06529a356c0a960637cb4efc32759ed9 Mon Sep 17 00:00:00 2001 From: Antonio De Lucreziis Date: Wed, 22 May 2024 14:05:06 +0200 Subject: [PATCH] initial commit --- .gitignore | 5 ++ Cargo.lock | 96 ++++++++++++++++++++++ Cargo.toml | 7 ++ README.md | 48 +++++++++++ examples/example.gfa | 32 ++++++++ src/main.rs | 40 ++++++++++ src/parser.rs | 184 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 412 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 examples/example.gfa create mode 100644 src/main.rs create mode 100644 src/parser.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..15aab6b --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# Cargo +/target + +# Local files +*.local* diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..7edd536 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,96 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "argh" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af5ba06967ff7214ce4c7419c7d185be7ecd6cc4965a8f6e1d8ce0398aad219" +dependencies = [ + "argh_derive", + "argh_shared", +] + +[[package]] +name = "argh_derive" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56df0aeedf6b7a2fc67d06db35b09684c3e8da0c95f8f27685cb17e08413d87a" +dependencies = [ + "argh_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "argh_shared" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5693f39141bda5760ecc4111ab08da40565d1771038c4a0250f03457ec707531" +dependencies = [ + "serde", +] + +[[package]] +name = "asd-2024-gfa" +version = "0.1.0" +dependencies = [ + "argh", +] + +[[package]] +name = "proc-macro2" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.202" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.202" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d2863d96a84c6439701d7a38f9de935ec562c8832cc55d1dde0f513b52fad106" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..2f1719b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "asd-2024-gfa" +version = "0.1.0" +edition = "2021" + +[dependencies] +argh = "0.1.12" diff --git a/README.md b/README.md new file mode 100644 index 0000000..1303e00 --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +# Progetto ASD 2023/2024 + +## Example GFA + +``` +H VN:Z:1.0 +S 11 G +S 12 A +S 13 T +S 14 T +S 15 A +S 16 C +S 17 A +S 21 G +S 22 A +S 23 T +S 24 T +S 25 A +L 11 + 12 + * +L 11 + 13 + * +L 12 + 14 + * +L 13 + 14 + * +L 14 + 15 + * +L 14 + 16 + * +L 15 + 17 + * +L 16 + 17 + * +L 21 + 22 + * +L 21 + 23 + * +L 22 + 24 + * +L 23 + 24 - * +L 24 + 25 + * +P A 11+,12+,14+,15+,17+ *,*,*,* +P B 21+,22+,24+,25+ *,*,* +W sample 1 A 0 5 >11>12>14>15>17 +W sample 2 A 0 5 >11>13>14>16>17 +W sample 1 B 0 5 >21>22>24<23<21 +W sample 2 B 0 4 >21>22>24>25 +``` + +## Note + +- Documentazione del formato GFA: http://gfa-spec.github.io/GFA-spec/GFA1.html + +- Data set: + - [example.gfa](https://github.com/jltsiren/gbwt-rs/blob/main/test-data/example.gfa) + - [drb1.gfa](https://github.com/pangenome/odgi/blob/master/test/DRB1-3123_unsorted.gfa) + - [chrY.gfa](https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrY.hprc-v1.0-pggb.gfa.gz) + - [chrX.gfa](https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz) diff --git a/examples/example.gfa b/examples/example.gfa new file mode 100644 index 0000000..ed4c02f --- /dev/null +++ b/examples/example.gfa @@ -0,0 +1,32 @@ +H VN:Z:1.0 +S 11 G +S 12 A +S 13 T 
+S	14	T
+S	15	A
+S	16	C
+S	17	A
+S	21	G
+S	22	A
+S	23	T
+S	24	T
+S	25	A
+L	11	+	12	+	*
+L	11	+	13	+	*
+L	12	+	14	+	*
+L	13	+	14	+	*
+L	14	+	15	+	*
+L	14	+	16	+	*
+L	15	+	17	+	*
+L	16	+	17	+	*
+L	21	+	22	+	*
+L	21	+	23	+	*
+L	22	+	24	+	*
+L	23	+	24	-	*
+L	24	+	25	+	*
+P	A	11+,12+,14+,15+,17+	*,*,*,*
+P	B	21+,22+,24+,25+	*,*,*
+W	sample	1	A	0	5	>11>12>14>15>17
+W	sample	2	A	0	5	>11>13>14>16>17
+W	sample	1	B	0	5	>21>22>24<23<21
+W	sample	2	B	0	4	>21>22>24>25
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..e12ec41
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,50 @@
+use argh::FromArgs;
+
+mod parser;
+
+// NOTE: argh builds the `--help` output from the `///` doc comments below, so
+// they are user-facing text and are intentionally left unchanged.
+
+#[derive(FromArgs, PartialEq, Debug)]
+/// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024
+struct CliTool {
+    #[argh(subcommand)]
+    nested: MySubCommandEnum,
+}
+
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand)]
+enum MySubCommandEnum {
+    Show(CommandShow),
+}
+
+#[derive(FromArgs, PartialEq, Debug)]
+/// Parse and show the content of a file
+#[argh(subcommand, name = "show")]
+struct CommandShow {
+    #[argh(option, short = 'i')]
+    /// file to read
+    input: String,
+}
+
+fn main() {
+    let opts = argh::from_env::<CliTool>();
+
+    match opts.nested {
+        MySubCommandEnum::Show(show) => {
+            // An unreadable input file is a user error, not a bug: report it
+            // on stderr and exit with a failure status instead of panicking.
+            let file = match std::fs::read_to_string(&show.input) {
+                Ok(content) => content,
+                Err(err) => {
+                    eprintln!("cannot read file {}: {}", show.input, err);
+                    std::process::exit(1);
+                }
+            };
+
+            for entry in parser::parse_source(&file) {
+                println!("{:?}", entry);
+            }
+        }
+    }
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..ba2d9ae
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,334 @@
+use std::str::FromStr;
+
+/// Orientation of a segment reference inside a link, path or walk
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Orientation {
+    Forward,
+    Reverse,
+}
+
+impl Orientation {
+    /// Decode an orientation symbol: `+` and `>` mean forward, `-` and `<`
+    /// mean reverse. Any other input yields `None`.
+    fn from_symbol(symbol: &str) -> Option<Self> {
+        match symbol {
+            "+" | ">" => Some(Orientation::Forward),
+            "-" | "<" => Some(Orientation::Reverse),
+            _ => None,
+        }
+    }
+}
+
+impl From<&str> for Orientation {
+    /// Infallible conversion kept for backward compatibility
+    ///
+    /// # Panics
+    ///
+    /// Panics if `s` is not one of `+`, `>`, `-`, `<`. The parsers in this
+    /// module call the fallible `from_symbol` instead, so malformed input
+    /// never reaches this panic.
+    fn from(s: &str) -> Self {
+        Orientation::from_symbol(s).unwrap_or_else(|| panic!("Invalid orientation: {}", s))
+    }
+}
+
+/// One record of a GFA file
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Entry {
+    /// `H` record: the version read from the `VN:Z:` tag
+    Header {
+        version: String,
+    },
+    /// `S` record: a segment id and its sequence
+    Segment {
+        id: String,
+        sequence: String,
+    },
+    /// `L` record: a link between two oriented segments
+    Link {
+        from: String,
+        from_orient: Orientation,
+        to: String,
+        to_orient: Orientation,
+    },
+    /// `P` record: a named list of oriented segment ids
+    Path {
+        name: String,
+        segments: Vec<(String, Orientation)>,
+    },
+    /// `W` record: a walk, as a list of oriented segment ids
+    Walk {
+        sample: String,
+
+        haplotype_index: usize,
+        seq_id: String,
+        seq_start: usize,
+        seq_end: usize,
+
+        segments: Vec<(String, Orientation)>,
+    },
+}
+
+/// Parse a line of the source file into a Header entry
+///
+/// ```txt
+/// H VN:Z:1.0
+/// ```
+///
+/// Columns are separated by tabs (shown as spaces above). The version is taken
+/// from the `VN:Z:` tag wherever it appears after the record type. Returns
+/// `None` when the tag is missing, because `Entry::Header` cannot represent a
+/// header without a version.
+fn parse_header(line: &str) -> Option<Entry> {
+    let version = line
+        .split('\t')
+        .skip(1)
+        .find_map(|field| field.strip_prefix("VN:Z:"))?;
+
+    Some(Entry::Header {
+        version: version.to_string(),
+    })
+}
+
+/// Parse a line of the source file into a Segment entry
+///
+/// ```txt
+/// S 1 ACGT
+/// ```
+///
+/// Returns `None` if the id or the sequence column is missing.
+fn parse_segment(line: &str) -> Option<Entry> {
+    let mut columns = line.split('\t').skip(1);
+
+    Some(Entry::Segment {
+        id: columns.next()?.to_string(),
+        sequence: columns.next()?.to_string(),
+    })
+}
+
+/// Parse a line of the source file into a Link entry
+///
+/// ```txt
+/// L 1 + 2 - 3M
+/// ```
+///
+/// The trailing overlap column is ignored. Returns `None` if a column is
+/// missing or an orientation is not a valid symbol.
+fn parse_link(line: &str) -> Option<Entry> {
+    let mut columns = line.split('\t').skip(1);
+
+    Some(Entry::Link {
+        from: columns.next()?.to_string(),
+        from_orient: Orientation::from_symbol(columns.next()?)?,
+        to: columns.next()?.to_string(),
+        to_orient: Orientation::from_symbol(columns.next()?)?,
+    })
+}
+
+/// Parse a line of the source file into a Path entry
+///
+/// ```txt
+/// P A 11+,12+,14+,15-,17+ *,*,*,*
+/// ```
+///
+/// The trailing overlaps column is ignored. Returns `None` if a column is
+/// missing or any step is malformed.
+fn parse_path(line: &str) -> Option<Entry> {
+    let mut columns = line.split('\t').skip(1);
+
+    let name = columns.next()?.to_string();
+    let segments = columns
+        .next()?
+        .split(',')
+        .map(parse_path_step)
+        .collect::<Option<Vec<_>>>()?;
+
+    Some(Entry::Path { name, segments })
+}
+
+/// Split one path step such as `11+` into its segment id and orientation
+fn parse_path_step(step: &str) -> Option<(String, Orientation)> {
+    // The orientation is the last character. Locate it through `char_indices`
+    // so that the split always falls on a character boundary.
+    let (split, _) = step.char_indices().next_back()?;
+    let (id, orient) = step.split_at(split);
+    if id.is_empty() {
+        return None;
+    }
+
+    Some((id.to_string(), Orientation::from_symbol(orient)?))
+}
+
+/// Parse the steps of a walk into (segment id, orientation) pairs
+///
+/// ```txt
+/// >11>12<14
+/// ```
+///
+/// Every step is an orientation symbol followed by a non-empty segment id that
+/// runs up to the next `<` or `>`. Returns `None` if the walk is empty or any
+/// step is malformed.
+fn parse_path_segments(s: &str) -> Option<Vec<(String, Orientation)>> {
+    let mut result = Vec::new();
+    let mut rest = s;
+
+    while !rest.is_empty() {
+        // `get` returns `None` instead of panicking when the first character
+        // is wider than one byte, so the slice on the next line is safe.
+        let orient = Orientation::from_symbol(rest.get(..1)?)?;
+        let after = &rest[1..];
+
+        let end = after.find(['<', '>']).unwrap_or(after.len());
+        let (name, tail) = after.split_at(end);
+        if name.is_empty() {
+            return None;
+        }
+
+        result.push((name.to_string(), orient));
+        rest = tail;
+    }
+
+    if result.is_empty() {
+        return None;
+    }
+    Some(result)
+}
+
+/// Parse a line of the source file into a Walk entry
+///
+/// ```txt
+/// W sample 1 A 0 5 >11>12>14>15>17
+/// ```
+///
+/// Returns `None` if a column is missing, if the haplotype index or one of the
+/// sequence bounds is not an unsigned integer, or if the walk is malformed.
+fn parse_walk(line: &str) -> Option<Entry> {
+    let mut columns = line.split('\t').skip(1);
+
+    Some(Entry::Walk {
+        sample: columns.next()?.to_string(),
+
+        haplotype_index: usize::from_str(columns.next()?).ok()?,
+        seq_id: columns.next()?.to_string(),
+        seq_start: usize::from_str(columns.next()?).ok()?,
+        seq_end: usize::from_str(columns.next()?).ok()?,
+
+        segments: parse_path_segments(columns.next()?)?,
+    })
+}
+
+/// Parse the content of a GFA file into a list of entries
+///
+/// Blank lines and lines starting with `#` are ignored. Lines whose record
+/// type is unknown or whose fields cannot be parsed are reported on stderr
+/// and skipped, so malformed input never causes a panic.
+pub fn parse_source(source: &str) -> Vec<Entry> {
+    let mut entries = Vec::new();
+
+    for line in source.lines() {
+        let line = line.trim();
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+
+        // The record type is the whole first tab-separated column.
+        let record_type = line.split('\t').next().unwrap_or(line);
+        let parsed = match record_type {
+            "H" => parse_header(line),
+            "S" => parse_segment(line),
+            "L" => parse_link(line),
+            "P" => parse_path(line),
+            "W" => parse_walk(line),
+            _ => {
+                eprintln!("Unknown line type: {}", line);
+                continue;
+            }
+        };
+
+        match parsed {
+            Some(entry) => entries.push(entry),
+            None => eprintln!("Malformed line: {}", line),
+        }
+    }
+
+    entries
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_every_record_type() {
+        let source = [
+            "H\tVN:Z:1.0",
+            "# comment",
+            "",
+            "S\t11\tG",
+            "L\t11\t+\t12\t-\t*",
+            "P\tA\t11+,12-\t*",
+            "W\tsample\t1\tA\t0\t5\t>11<12",
+        ]
+        .join("\n");
+
+        let steps = vec![
+            ("11".to_string(), Orientation::Forward),
+            ("12".to_string(), Orientation::Reverse),
+        ];
+        let expected = vec![
+            Entry::Header {
+                version: "1.0".to_string(),
+            },
+            Entry::Segment {
+                id: "11".to_string(),
+                sequence: "G".to_string(),
+            },
+            Entry::Link {
+                from: "11".to_string(),
+                from_orient: Orientation::Forward,
+                to: "12".to_string(),
+                to_orient: Orientation::Reverse,
+            },
+            Entry::Path {
+                name: "A".to_string(),
+                segments: steps.clone(),
+            },
+            Entry::Walk {
+                sample: "sample".to_string(),
+                haplotype_index: 1,
+                seq_id: "A".to_string(),
+                seq_start: 0,
+                seq_end: 5,
+                segments: steps,
+            },
+        ];
+
+        assert_eq!(parse_source(&source), expected);
+    }
+
+    #[test]
+    fn header_version_is_found_among_other_tags() {
+        let expected = vec![Entry::Header {
+            version: "1.1".to_string(),
+        }];
+
+        assert_eq!(parse_source("H\tXX:Z:extra\tVN:Z:1.1"), expected);
+    }
+
+    #[test]
+    fn malformed_lines_are_skipped_without_panicking() {
+        let source = [
+            "H",
+            "S\t11",
+            "L\t11\t?\t12\t+\t*",
+            "P\tA",
+            "P\tA\t+",
+            "W\tsample\tx\tA\t0\t5\t>11",
+            "W\tsample\t1\tA\t0\t5\t11>12",
+            "X\t1",
+        ]
+        .join("\n");
+
+        assert!(parse_source(&source).is_empty());
+    }
+}