From d4f6dbc2c06504ed1236ebebf2dba16423d83435 Mon Sep 17 00:00:00 2001 From: Luca Lombardo Date: Sat, 25 May 2024 14:55:57 +0200 Subject: [PATCH] Versione probabilmente non funzionante con CUDA --- Cargo.lock | 123 +++++++++++++- examples/graphs-1-cuda/Cargo.toml | 18 +++ examples/graphs-1-cuda/src/kernel.cu | 48 ++++++ examples/graphs-1-cuda/src/kernel.ptx | 160 ++++++++++++++++++ examples/graphs-1-cuda/src/main.rs | 223 ++++++++++++++++++++++++++ 5 files changed, 570 insertions(+), 2 deletions(-) create mode 100644 examples/graphs-1-cuda/Cargo.toml create mode 100644 examples/graphs-1-cuda/src/kernel.cu create mode 100644 examples/graphs-1-cuda/src/kernel.ptx create mode 100644 examples/graphs-1-cuda/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 60e2154..af9cf74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -938,6 +938,52 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96a6ac251f4a2aca6b3f91340350eab87ae57c3f127ffeb585e92bd336717991" +[[package]] +name = "cust" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6cc71911e179f12483b9734120b45bd00bf64fab085cc4818428523eedd469" +dependencies = [ + "bitflags 1.3.2", + "bytemuck", + "cust_core", + "cust_derive", + "cust_raw", + "find_cuda_helper", +] + +[[package]] +name = "cust_core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "039f79662cb8f890cbf335e818cd522d6e3a53fe63f61d1aaaf859cd3d975f06" +dependencies = [ + "cust_derive", + "glam 0.20.5", + "mint", + "vek", +] + +[[package]] +name = "cust_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3bc95fe629aed92b2423de6ccff9e40174b21d19cb6ee6281a4d04ac72f66" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "cust_raw" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf40d6ade12cb9828bbc844b9875c7b93d25e67a3c9bf61c7aa3ae09e402bf8" +dependencies = [ + "find_cuda_helper", +] + [[package]] name = "derivative" version = "2.2.0" @@ -1313,12 +1359,21 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2417e237094dfd115a4aa11b2a3dc6cda45e8d36572a7862ba9a48402f7441a3" dependencies = [ - "glam", + "glam 0.21.3", "hashlink", "petgraph", "quad-rand", ] +[[package]] +name = "find_cuda_helper" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad" +dependencies = [ + "glob", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -1491,12 +1546,27 @@ dependencies = [ "xml-rs", ] +[[package]] +name = "glam" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43e957e744be03f5801a55472f593d43fabdebf25a4585db250f04d86b1675f" +dependencies = [ + "num-traits", +] + [[package]] name = "glam" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "518faa5064866338b013ff9b2350dc318e14cc4fcd6cb8206d7e7c9886c98815" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "glow" version = "0.13.1" @@ -1641,6 +1711,22 @@ dependencies = [ "rayon", ] +[[package]] +name = "graphs_1-cuda" +version = "0.1.0" +dependencies = [ + "asd", + "cust", + "fdg", + "macroquad", + "nalgebra", + "num-traits", + "petgraph", + "petgraph-gen", + "rand", + "rayon", +] + [[package]] name = "hashbrown" version = "0.13.2" @@ -1923,7 +2009,7 @@ checksum = "6d983c03e9171099b2eb2067206c78cb37904ea071bb9ea4602c43ed45bb7097" dependencies = [ "bumpalo", "fontdue", - "glam", + "glam 0.21.3", "image", "macroquad_macro", "miniquad", @@ -2026,6 +2112,12 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "mint" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e53debba6bda7a793e5f99b8dacf19e626084f525f7829104ba9898f367d85ff" + [[package]] name = "naga" version = "0.19.2" @@ -2734,6 +2826,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.37.27" @@ -2810,6 +2911,12 @@ dependencies = [ "tiny-skia", ] +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + [[package]] name = "serde" version = "1.0.202" @@ -3241,6 +3348,18 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "vek" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8085882662f9bc47fc8b0cdafa5e19df8f592f650c02b9083da8d45ac9eebd17" +dependencies = [ + "approx", + "num-integer", + "num-traits", + "rustc_version", +] + [[package]] name = "version_check" version = "0.9.4" diff --git a/examples/graphs-1-cuda/Cargo.toml b/examples/graphs-1-cuda/Cargo.toml new file mode 100644 index 0000000..0e8dcaf --- /dev/null +++ b/examples/graphs-1-cuda/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "graphs_1-cuda" +version = "0.1.0" +edition = "2021" + +[dependencies] +nalgebra = { version = "0.32.3", features = ["rand"] } +petgraph = { version = "0.6.4", features = [ + "stable_graph", +], default-features = false } +num-traits = "0.2.17" +rand = "0.8.5" +macroquad = "0.4.4" +petgraph-gen = "0.1.3" +fdg = { git = "https://github.com/grantshandy/fdg" } +asd = { path = "../../" } +rayon = "1.10.0" +cust = "0.3.2" diff --git a/examples/graphs-1-cuda/src/kernel.cu b/examples/graphs-1-cuda/src/kernel.cu new file mode 100644 index 0000000..e8ca371 --- /dev/null +++ b/examples/graphs-1-cuda/src/kernel.cu @@ -0,0 +1,48 @@ +extern "C" __global__ void compute_forces( + float *positions_x, float *positions_y, + float *forces_x, float *forces_y, + int *neighbors, float *distances, + int num_nodes, int max_neighbors) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_nodes) + { + float pos_x = positions_x[idx]; + float pos_y = positions_y[idx]; + float force_x = 0.0f; + float force_y = 0.0f; + + for (int i = 0; i < max_neighbors; i++) + { + int other_idx = neighbors[idx * max_neighbors + i]; + if (other_idx == -1) + break; // No more neighbors + + float other_pos_x = positions_x[other_idx]; + float other_pos_y = positions_y[other_idx]; + float distance = distances[idx * max_neighbors + i]; + + float delta_x = other_pos_x - pos_x; + float delta_y = other_pos_y - pos_y; + float dist = delta_x * delta_x + delta_y * delta_y; + float correction = dist - (distance * distance); + + if (distance > 0.0f && dist > 1e-6f) + { + float scale = 0.01f * atanf(correction) / sqrtf(dist); + force_x += delta_x * scale; + force_y += delta_y * scale; + } + + if (dist > 1e-6f) + { + float repel_scale = 0.01f / max(dist, 1.0f); + force_x -= delta_x * repel_scale; + force_y -= delta_y * repel_scale; + } + } + + forces_x[idx] = force_x; + forces_y[idx] = force_y; + } +} diff --git a/examples/graphs-1-cuda/src/kernel.ptx b/examples/graphs-1-cuda/src/kernel.ptx new file mode 100644 index 0000000..510f912 --- /dev/null +++ b/examples/graphs-1-cuda/src/kernel.ptx @@ -0,0 +1,160 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-30672275 +// Cuda compilation tools, release 11.5, V11.5.119 +// Based on NVVM 7.0.1 +// + +.version 7.5 +.target sm_52 +.address_size 64 + + // .globl compute_forces + +.visible .entry compute_forces( + .param .u64 compute_forces_param_0, + .param .u64 compute_forces_param_1, + .param .u64 compute_forces_param_2, + .param .u64 compute_forces_param_3, + .param .u64 compute_forces_param_4, + .param .u64 compute_forces_param_5, + .param .u32 compute_forces_param_6, + .param .u32 compute_forces_param_7 +) +{ + .reg .pred %p<12>; + .reg .f32 %f<69>; + .reg .b32 %r<18>; + .reg .b64 %rd<26>; + + + ld.param.u64 %rd9, [compute_forces_param_0]; + ld.param.u64 %rd10, [compute_forces_param_1]; + ld.param.u64 %rd5, [compute_forces_param_2]; + ld.param.u64 %rd6, [compute_forces_param_3]; + ld.param.u64 %rd7, [compute_forces_param_4]; + ld.param.u64 %rd8, [compute_forces_param_5]; + ld.param.u32 %r8, [compute_forces_param_6]; + ld.param.u32 %r7, [compute_forces_param_7]; + cvta.to.global.u64 %rd1, %rd10; + cvta.to.global.u64 %rd2, %rd9; + mov.u32 %r9, %ntid.x; + mov.u32 %r10, %ctaid.x; + mov.u32 %r11, %tid.x; + mad.lo.s32 %r1, %r10, %r9, %r11; + setp.ge.s32 %p1, %r1, %r8; + @%p1 bra $L__BB0_12; + + setp.lt.s32 %p2, %r7, 1; + mov.f32 %f67, 0f00000000; + mov.f32 %f68, %f67; + @%p2 bra $L__BB0_11; + + cvta.to.global.u64 %rd3, %rd8; + cvta.to.global.u64 %rd4, %rd7; + mul.wide.s32 %rd11, %r1, 4; + add.s64 %rd12, %rd2, %rd11; + ld.global.f32 %f1, [%rd12]; + add.s64 %rd13, %rd1, %rd11; + ld.global.f32 %f2, [%rd13]; + mul.lo.s32 %r2, %r1, %r7; + mov.u32 %r17, 0; + +$L__BB0_3: + add.s32 %r4, %r17, %r2; + mul.wide.s32 %rd14, %r4, 4; + add.s64 %rd15, %rd4, %rd14; + ld.global.u32 %r5, [%rd15]; + setp.eq.s32 %p3, %r5, -1; + @%p3 bra $L__BB0_11; + + mul.wide.s32 %rd16, %r5, 4; + add.s64 %rd17, %rd2, %rd16; + ld.global.f32 %f26, [%rd17]; + sub.f32 %f5, %f26, %f1; + add.s64 %rd18, %rd1, %rd16; + ld.global.f32 %f27, [%rd18]; + sub.f32 %f6, %f27, %f2; + mul.f32 %f28, %f6, %f6; + fma.rn.f32 %f7, %f5, %f5, %f28; + add.s64 %rd20, %rd3, %rd14; + ld.global.f32 %f29, [%rd20]; + mul.f32 %f30, %f29, %f29; + sub.f32 %f8, %f7, %f30; + setp.leu.f32 %p4, %f29, 0f00000000; + setp.leu.f32 %p5, %f7, 0f358637BD; + or.pred %p6, %p4, %p5; + @%p6 bra $L__BB0_8; + + abs.f32 %f9, %f8; + setp.leu.f32 %p7, %f9, 0f3F800000; + mov.f32 %f62, %f9; + @%p7 bra $L__BB0_7; + + rcp.rn.f32 %f62, %f9; + +$L__BB0_7: + mul.rn.f32 %f31, %f62, %f62; + mov.f32 %f32, 0fC0B59883; + mov.f32 %f33, 0fBF52C7EA; + fma.rn.f32 %f34, %f31, %f33, %f32; + mov.f32 %f35, 0fC0D21907; + fma.rn.f32 %f36, %f34, %f31, %f35; + mul.f32 %f37, %f31, %f36; + mul.f32 %f38, %f62, %f37; + add.f32 %f39, %f31, 0f41355DC0; + mov.f32 %f40, 0f41E6BD60; + fma.rn.f32 %f41, %f39, %f31, %f40; + mov.f32 %f42, 0f419D92C8; + fma.rn.f32 %f43, %f41, %f31, %f42; + rcp.rn.f32 %f44, %f43; + fma.rn.f32 %f45, %f38, %f44, %f62; + mov.f32 %f46, 0f3FC90FDB; + sub.f32 %f47, %f46, %f45; + setp.gt.f32 %p8, %f9, 0f3F800000; + selp.f32 %f48, %f47, %f45, %p8; + mov.b32 %r13, %f48; + mov.b32 %r14, %f8; + and.b32 %r15, %r14, -2147483648; + or.b32 %r16, %r15, %r13; + mov.b32 %f49, %r16; + setp.le.f32 %p9, %f9, 0f7F800000; + selp.f32 %f50, %f49, %f48, %p9; + mul.f32 %f51, %f50, 0f3C23D70A; + sqrt.rn.f32 %f52, %f7; + div.rn.f32 %f53, %f51, %f52; + fma.rn.f32 %f67, %f5, %f53, %f67; + fma.rn.f32 %f68, %f6, %f53, %f68; + +$L__BB0_8: + @%p5 bra $L__BB0_10; + + mov.f32 %f54, 0f3F800000; + max.f32 %f55, %f7, %f54; + mov.f32 %f56, 0f3C23D70A; + div.rn.f32 %f57, %f56, %f55; + mul.f32 %f58, %f5, %f57; + sub.f32 %f67, %f67, %f58; + mul.f32 %f59, %f6, %f57; + sub.f32 %f68, %f68, %f59; + +$L__BB0_10: + add.s32 %r17, %r17, 1; + setp.lt.s32 %p11, %r17, %r7; + @%p11 bra $L__BB0_3; + +$L__BB0_11: + cvta.to.global.u64 %rd21, %rd6; + cvta.to.global.u64 %rd22, %rd5; + mul.wide.s32 %rd23, %r1, 4; + add.s64 %rd24, %rd22, %rd23; + st.global.f32 [%rd24], %f67; + add.s64 %rd25, %rd21, %rd23; + st.global.f32 [%rd25], %f68; + +$L__BB0_12: + ret; + +} + diff --git a/examples/graphs-1-cuda/src/main.rs b/examples/graphs-1-cuda/src/main.rs new file mode 100644 index 0000000..a7c719d --- /dev/null +++ b/examples/graphs-1-cuda/src/main.rs @@ -0,0 +1,223 @@ +use cust::prelude::*; +use std::fs::File; +use std::io::Read; + +async fn main() { + // Initialize CUDA context + let _ctx = cust::quick_init().unwrap(); + + // Load the CUDA module + let mut file = File::open("kernel.ptx").unwrap(); + let mut ptx = String::new(); + file.read_to_string(&mut ptx).unwrap(); + let module = Module::load_from_string(&ptx).unwrap(); + let stream = Stream::new(StreamFlags::DEFAULT, None).unwrap(); + + let mut graph = load_graph(); + + let mut desired_distance_matrix = HashMap::new(); + graph.node_indices().for_each(|idx| { + desired_distance_matrix.insert(idx, dijkstra(&graph, idx, None, |_| 1.0 as f32)); + }); + + loop { + let num_nodes = graph.node_count(); + let max_neighbors = 10; // Adjust based on your data + + // Prepare data for GPU + let positions_x: Vec = graph.node_weights().map(|(_, pos)| pos.x).collect(); + let positions_y: Vec = graph.node_weights().map(|(_, pos)| pos.y).collect(); + let mut forces_x = vec![0.0f32; num_nodes]; + let mut forces_y = vec![0.0f32; num_nodes]; + let neighbors = vec![-1; num_nodes * max_neighbors]; // Placeholder for neighbors + let distances = vec![0.0f32; num_nodes * max_neighbors]; // Placeholder for distances + + // TODO: Fill neighbors and distances arrays based on your graph structure + + // Allocate device memory + let positions_x_device = positions_x.as_slice().as_dvec().unwrap(); + let positions_y_device = positions_y.as_slice().as_dvec().unwrap(); + let forces_x_device = forces_x.as_mut_slice().as_dvec().unwrap(); + let forces_y_device = forces_y.as_mut_slice().as_dvec().unwrap(); + let neighbors_device = neighbors.as_slice().as_dvec().unwrap(); + let distances_device = distances.as_slice().as_dvec().unwrap(); + + // Launch the CUDA kernel + unsafe { + launch!(module.compute_forces<<>>( + positions_x_device.as_device_ptr(), + positions_y_device.as_device_ptr(), + forces_x_device.as_device_ptr(), + forces_y_device.as_device_ptr(), + neighbors_device.as_device_ptr(), + distances_device.as_device_ptr(), + num_nodes as i32, + max_neighbors as i32 + )) + .unwrap(); + } + + stream.synchronize().unwrap(); + + // Copy results back to host + forces_x_device.copy_to(&mut forces_x).unwrap(); + forces_y_device.copy_to(&mut forces_y).unwrap(); + + // Update node positions + for (i, (force_x, force_y)) in forces_x.iter().zip(forces_y.iter()).enumerate() { + let (_, pos) = graph.node_weight_mut(NodeIndex::new(i)).unwrap(); + pos.x += force_x; + pos.y += force_y; + } + + // Render + let now = Instant::now(); + clear_background(WHITE); + draw_graph(&graph); + let elapsed = now.elapsed(); + println!("frame: {:?}", elapsed); + next_frame().await; + } +} + +fn load_graph() -> StableGraph<(String, Point2), ()> { + println!("Loading graph"); + + let mut graph = StableGraph::new(); + + let file = std::fs::File::open(env::args().nth(1).expect("missing gfa file argument")).unwrap(); + let entries = parser::parse_source(file).unwrap(); + + let mut index_map = HashMap::new(); + + let node_count = entries + .iter() + .filter_map(|entry| match entry { + Entry::Segment { id, .. } => Some(id), + _ => None, + }) + .count(); + + println!("Node count: {}", node_count); + + let radius = (node_count as f32).sqrt(); + + let mut i = -10.0; + + for entry in entries + .iter() + .filter(|entry| matches!(entry, Entry::Link { .. })) + .take(3000) + { + // println!("{:?}", entry); + + if let Entry::Link { + from, + from_orient, + to, + to_orient, + } = entry + { + // add first node if not present + let a = index_map + .entry(from.clone()) + .or_insert_with(|| { + i += 1.0; + + graph.add_node(( + format!("{}{}", from, from_orient), + Point2::new(rand::gen_range(0.0, radius), rand::gen_range(0.0, radius)), + // Point2::new(i, 50.0 + rand::gen_range(0.0, 100.0)), + )) + }) + .to_owned(); + + // add second node if not present + let b = index_map + .entry(to.clone()) + .or_insert_with(|| { + i += 1.0; + + graph.add_node(( + format!("{}{}", from, to_orient), + Point2::new(rand::gen_range(0.0, radius), rand::gen_range(0.0, radius)), + // Point2::new(i, 50.0 + rand::gen_range(0.0, 100.0)), + )) + }) + .to_owned(); + + graph.add_edge(a, b, ()); + graph.add_edge(b, a, ()); + } + } + + println!("Loading completed"); + + graph +} + +fn draw_graph(graph: &StableGraph<(String, Point2), ()>) { + let (width, height) = (screen_width(), screen_height()); + + let (min_x, max_x) = graph + .node_weights() + .map(|(_, pos)| pos.x) + .fold((f32::INFINITY, f32::NEG_INFINITY), |(min, max), x| { + (min.min(x), max.max(x)) + }); + + let (min_y, max_y) = graph + .node_weights() + .map(|(_, pos)| pos.y) + .fold((f32::INFINITY, f32::NEG_INFINITY), |(min, max), y| { + (min.min(y), max.max(y)) + }); + + let source_range: f32 = (max_x - min_x).max(max_y - min_y); + + for idx in graph.edge_indices() { + let ((_, source), (_, target)) = graph + .edge_endpoints(idx) + .map(|(a, b)| (graph.node_weight(a).unwrap(), graph.node_weight(b).unwrap())) + .unwrap(); + + draw_line( + remap(source.x, min_x, min_x + source_range, 10.0, width - 10.0), + remap(source.y, min_y, min_y + source_range, 10.0, height - 10.0), + remap(target.x, min_x, min_x + source_range, 10.0, width - 10.0), + remap(target.y, min_y, min_y + source_range, 10.0, height - 10.0), + 1.0, + BLACK, + ); + } + + for (_label, pos) in graph.node_weights() { + let x = remap(pos.x, min_x, min_x + source_range, 10.0, width - 10.0); + let y = remap(pos.y, min_y, min_y + source_range, 10.0, height - 10.0); + + draw_circle(x, y, 2.0, RED); + // draw_text(label.as_str(), x - 30.0, y - 30.0, 10.0, BLACK); + } + + draw_line( + remap(0.0, min_x, min_x + source_range, 10.0, width - 10.0), + remap(0.0, min_y, min_y + source_range, 10.0, height - 10.0), + remap(100.0, min_x, min_x + source_range, 10.0, width - 10.0), + remap(0.0, min_y, min_y + source_range, 10.0, height - 10.0), + 2.0, + BLUE, + ); + + draw_line( + remap(0.0, min_x, min_x + source_range, 10.0, width - 10.0), + remap(0.0, min_y, min_y + source_range, 10.0, height - 10.0), + remap(0.0, min_x, min_x + source_range, 10.0, width - 10.0), + remap(100.0, min_y, min_y + source_range, 10.0, height - 10.0), + 2.0, + BLUE, + ); +} + +fn remap(value: f32, from_min: f32, from_max: f32, to_min: f32, to_max: f32) -> f32 { + (value - from_min) / (from_max - from_min) * (to_max - to_min) + to_min +}