initial commit

main
Antonio De Lucreziis 8 months ago
commit c897945e06

5
.gitignore vendored

@ -0,0 +1,5 @@
# Cargo
/target
# Local files
*.local*

96
Cargo.lock generated

@ -0,0 +1,96 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "argh"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7af5ba06967ff7214ce4c7419c7d185be7ecd6cc4965a8f6e1d8ce0398aad219"
dependencies = [
"argh_derive",
"argh_shared",
]
[[package]]
name = "argh_derive"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56df0aeedf6b7a2fc67d06db35b09684c3e8da0c95f8f27685cb17e08413d87a"
dependencies = [
"argh_shared",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "argh_shared"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5693f39141bda5760ecc4111ab08da40565d1771038c4a0250f03457ec707531"
dependencies = [
"serde",
]
[[package]]
name = "asd-2024-gfa"
version = "0.1.0"
dependencies = [
"argh",
]
[[package]]
name = "proc-macro2"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.202"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.202"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "2.0.65"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2863d96a84c6439701d7a38f9de935ec562c8832cc55d1dde0f513b52fad106"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

@ -0,0 +1,7 @@
[package]
name = "asd-2024-gfa"
version = "0.1.0"
edition = "2021"
[dependencies]
argh = "0.1.12"

@ -0,0 +1,48 @@
# Progetto ASD 2023/2024
## Example GFA
```
H VN:Z:1.0
S 11 G
S 12 A
S 13 T
S 14 T
S 15 A
S 16 C
S 17 A
S 21 G
S 22 A
S 23 T
S 24 T
S 25 A
L 11 + 12 + *
L 11 + 13 + *
L 12 + 14 + *
L 13 + 14 + *
L 14 + 15 + *
L 14 + 16 + *
L 15 + 17 + *
L 16 + 17 + *
L 21 + 22 + *
L 21 + 23 + *
L 22 + 24 + *
L 23 + 24 - *
L 24 + 25 + *
P A 11+,12+,14+,15+,17+ *,*,*,*
P B 21+,22+,24+,25+ *,*,*
W sample 1 A 0 5 >11>12>14>15>17
W sample 2 A 0 5 >11>13>14>16>17
W sample 1 B 0 5 >21>22>24<23<21
W sample 2 B 0 4 >21>22>24>25
```
## Note
- Documentazione del formato GFA: http://gfa-spec.github.io/GFA-spec/GFA1.html
- Data set:
- [example.gfa](https://github.com/jltsiren/gbwt-rs/blob/main/test-data/example.gfa)
- [drb1.gfa](https://github.com/pangenome/odgi/blob/master/test/DRB1-3123_unsorted.gfa)
- [chrY.gfa](https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrY.hprc-v1.0-pggb.gfa.gz)
- [chrX.gfa](https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz)

@ -0,0 +1,32 @@
H VN:Z:1.0
S 11 G
S 12 A
S 13 T
S 14 T
S 15 A
S 16 C
S 17 A
S 21 G
S 22 A
S 23 T
S 24 T
S 25 A
L 11 + 12 + *
L 11 + 13 + *
L 12 + 14 + *
L 13 + 14 + *
L 14 + 15 + *
L 14 + 16 + *
L 15 + 17 + *
L 16 + 17 + *
L 21 + 22 + *
L 21 + 23 + *
L 22 + 24 + *
L 23 + 24 - *
L 24 + 25 + *
P A 11+,12+,14+,15+,17+ *,*,*,*
P B 21+,22+,24+,25+ *,*,*
W sample 1 A 0 5 >11>12>14>15>17
W sample 2 A 0 5 >11>13>14>16>17
W sample 1 B 0 5 >21>22>24<23<21
W sample 2 B 0 4 >21>22>24>25

@ -0,0 +1,40 @@
use argh::FromArgs;
mod parser;
// Root of the command-line interface; `main` obtains it via `argh::from_env`.
// NOTE: the `///` line below is deliberately left untranslated: argh's derive turns
// doc comments into the generated help text (see the argh crate documentation), so it
// is user-facing program output rather than an ordinary comment.
#[derive(FromArgs, PartialEq, Debug)]
/// Strumento CLI per il progetto di Algoritmi e Strutture Dati 2024
struct CliTool {
// The subcommand chosen on the command line, carrying that subcommand's own arguments.
#[argh(subcommand)]
nested: MySubCommandEnum,
}
// Set of subcommands accepted by the tool; `main` matches on this enum to dispatch.
// Plain `//` comments are used here because argh consumes `///` doc comments as help text.
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand)]
enum MySubCommandEnum {
// `show`: parse a file and print every parsed entry (see `CommandShow`).
Show(CommandShow),
}
// Arguments of the `show` subcommand.
// The `///` lines in this struct are the help strings argh prints for the subcommand and
// its option, so they are left exactly as they are.
#[derive(FromArgs, PartialEq, Debug)]
/// Parse and show the content of a file
#[argh(subcommand, name = "show")]
struct CommandShow {
// Path of the file to read; declared as an option with the short flag `-i`.
// `main` passes this value to `std::fs::read_to_string`.
#[argh(option, short = 'i')]
/// file to read
input: String,
}
// Entry point: parses the command line with argh and runs the selected subcommand.
//
// `show` reads the whole input file into memory, parses it with
// `parser::parse_source` and prints each entry using its `Debug` representation.
// If the file cannot be read, the path and the OS error are reported on stderr and
// the process exits with status 1 instead of panicking.
fn main() {
    let opts = argh::from_env::<CliTool>();
    match opts.nested {
        MySubCommandEnum::Show(show) => {
            // An unreadable path is a user error, not a bug: report it cleanly.
            let file = match std::fs::read_to_string(&show.input) {
                Ok(contents) => contents,
                Err(err) => {
                    eprintln!("cannot read file '{}': {}", show.input, err);
                    std::process::exit(1);
                }
            };
            for entry in parser::parse_source(file.as_str()) {
                println!("{:?}", entry);
            }
        }
    }
}

@ -0,0 +1,184 @@
use std::str::FromStr;
/// Orientation of a segment inside a link, path or walk.
///
/// The enum is a plain two-state flag, so it derives the cheap value-type traits
/// (`Clone`, `Copy`, `PartialEq`, `Eq`, `Hash`) in addition to `Debug`; this lets
/// callers compare, copy and hash orientations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Orientation {
    /// Forward strand, written `+` or `>` in the input.
    Forward,
    /// Reverse strand, written `-` or `<` in the input.
    Reverse,
}
impl From<&str> for Orientation {
fn from(s: &str) -> Self {
match s {
"+" => Orientation::Forward,
">" => Orientation::Forward,
"-" => Orientation::Reverse,
"<" => Orientation::Reverse,
_ => panic!("Invalid orientation: {}", s),
}
}
}
/// One parsed record of the input file; `parse_source` produces one `Entry` per
/// recognised line (`H`, `S`, `L`, `P` or `W`).
#[derive(Debug)]
pub enum Entry {
/// Header record (`H` line), built by `parse_header`.
Header {
/// Version text extracted from the header line (e.g. `1.0` for `VN:Z:1.0`).
version: String,
},
/// Segment record (`S` line), built by `parse_segment`.
Segment {
/// Segment identifier (second column).
id: String,
/// Sequence text of the segment (third column), stored verbatim.
sequence: String,
},
/// Link record (`L` line), built by `parse_link`.
/// The trailing overlap column (e.g. `*` or `3M`) is not stored.
Link {
/// Identifier of the segment the link starts from (second column).
from: String,
/// Orientation of the `from` segment (third column).
from_orient: Orientation,
/// Identifier of the segment the link points to (fourth column).
to: String,
/// Orientation of the `to` segment (fifth column).
to_orient: Orientation,
},
/// Path record (`P` line), built by `parse_path`.
/// The trailing overlaps column (e.g. `*,*,*`) is not stored.
Path {
/// Path name (second column).
name: String,
/// Visited segments in order, each with its orientation (from items like `11+`).
segments: Vec<(String, Orientation)>,
},
/// Walk record (`W` line), built by `parse_walk`.
Walk {
/// Sample name (second column).
sample: String,
/// Third column, parsed as an unsigned integer.
haplotype_index: usize,
/// Fourth column, stored verbatim.
seq_id: String,
/// Fifth column, parsed as an unsigned integer.
seq_start: usize,
/// Sixth column, parsed as an unsigned integer.
seq_end: usize,
/// Visited segments in order, each with its orientation (from text like `>11<12`).
segments: Vec<(String, Orientation)>,
},
}
/// Parse a header (`H`) line of the source file into an [`Entry::Header`].
///
/// ```txt
/// H VN:Z:1.0
/// ```
///
/// The version is read from the `VN:Z:` tag, wherever it appears among the
/// whitespace-separated fields that follow the record type, so additional header
/// tags before or after it do not leak into the version string.
///
/// The `VN` tag is optional in GFA; when it is absent the returned `version` is the
/// empty string (the function no longer panics on such headers).
fn parse_header(line: &str) -> Entry {
    let version = line
        .split_whitespace()
        // The first field is the record type (`H`); tags start after it.
        .skip(1)
        .find_map(|field| field.strip_prefix("VN:Z:"))
        .unwrap_or("")
        .to_string();

    Entry::Header { version }
}
/// Build an [`Entry::Segment`] from a tab-separated segment (`S`) line.
///
/// ```txt
/// S 1 ACGT
/// ```
///
/// The second column becomes the id and the third the sequence; any further
/// columns are ignored.
///
/// # Panics
///
/// Panics (index out of bounds) when the line has fewer than three columns.
fn parse_segment(line: &str) -> Entry {
    let fields = line.split('\t').collect::<Vec<_>>();

    let id = String::from(fields[1]);
    let sequence = String::from(fields[2]);

    Entry::Segment { id, sequence }
}
/// Parse a line of the source file into a Link struct
///
/// ```txt
/// L 1 + 2 - 3M
/// ```
fn parse_link(line: &str) -> Entry {
let columns: Vec<&str> = line.split('\t').collect();
Entry::Link {
from: columns[1].to_string(),
from_orient: columns[2].into(),
to: columns[3].to_string(),
to_orient: columns[4].into(),
}
}
/// Parse a line of the source file into a Path struct
///
/// ```txt
/// P A 11+,12+,14+,15-,17+ *,*,*,*
/// ```
fn parse_path(line: &str) -> Entry {
let columns: Vec<&str> = line.split('\t').collect();
Entry::Path {
name: columns[1].to_string(),
segments: columns[2]
.split(',')
.map(|s| {
let (name, orient) = s.split_at(s.len() - 1);
(name.to_string(), orient.into())
})
.collect(),
}
}
/// Parse the step list of a walk, e.g. `>11>12<14`, into `(segment name, orientation)`
/// pairs in input order.
///
/// Each step is one orientation marker (`>` or `<`) followed by the segment name,
/// which runs up to the next marker or the end of the string.
///
/// An empty string yields an empty vector. The function writes nothing to
/// stdout, so the caller's own output is not interleaved with debug traces.
///
/// # Panics
///
/// Panics when a step does not start with a marker accepted by
/// `Orientation::from` (for example `11>12`).
fn parse_path_segments(s: &str) -> Vec<(String, Orientation)> {
    let mut result = Vec::new();
    let mut rest = s;

    while !rest.is_empty() {
        // First byte is the orientation marker, the name follows it.
        let (orient, tail) = rest.split_at(1);
        let name_len = tail.find(['<', '>']).unwrap_or(tail.len());
        let (name, remaining) = tail.split_at(name_len);

        result.push((name.to_string(), orient.into()));
        rest = remaining;
    }

    result
}
/// Build an [`Entry::Walk`] from a tab-separated walk (`W`) line.
///
/// ```txt
/// W sample 1 A 0 5 >11>12>14>15>17
/// ```
///
/// Columns two to seven supply, in order: sample, haplotype index, sequence id,
/// sequence start, sequence end and the step list (handed to `parse_path_segments`).
///
/// # Panics
///
/// Panics when the line has fewer than seven columns or when the haplotype
/// index, start or end column is not an unsigned integer (this includes a `*`
/// placeholder in the start/end columns).
fn parse_walk(line: &str) -> Entry {
    let fields = line.split('\t').collect::<Vec<_>>();
    // Reads the column at `idx` as an unsigned integer, panicking if it is not one.
    let int_at = |idx: usize| usize::from_str(fields[idx]).unwrap();

    let sample = String::from(fields[1]);
    let haplotype_index = int_at(2);
    let seq_id = String::from(fields[3]);
    let seq_start = int_at(4);
    let seq_end = int_at(5);
    let segments = parse_path_segments(fields[6]);

    Entry::Walk {
        sample,
        haplotype_index,
        seq_id,
        seq_start,
        seq_end,
        segments,
    }
}
/// Parse the full text of a GFA file into a list of [`Entry`] values, one per
/// recognised line, in input order.
///
/// Every line is trimmed first. Blank lines and lines starting with `#` are
/// skipped. The first character of the remaining lines selects the parser:
/// `H` header, `S` segment, `L` link, `P` path, `W` walk. Lines of any other
/// type are reported on stderr and skipped.
///
/// Nothing is written to stdout, so callers can print the returned entries
/// without parser traces mixed into their output.
///
/// # Panics
///
/// Panics if one of the per-record parsers panics on a malformed line.
pub fn parse_source(source: &str) -> Vec<Entry> {
    let mut entries = Vec::new();

    for line in source.lines().map(str::trim) {
        let entry = match line.chars().next() {
            // Blank line or comment: nothing to parse.
            None | Some('#') => continue,
            Some('H') => parse_header(line),
            Some('S') => parse_segment(line),
            Some('L') => parse_link(line),
            Some('P') => parse_path(line),
            Some('W') => parse_walk(line),
            Some(_) => {
                eprintln!("Unknown line type: {}", line);
                continue;
            }
        };
        entries.push(entry);
    }

    entries
}
Loading…
Cancel
Save