🚀
GPU Computing
High-performance physics simulations with native GPU kernels
GPU Computing with Sounio
Sounio provides native GPU support for high-performance scientific computing, without leaving the language.
Why GPU in Sounio?
Scientific computing often requires massive parallelism:
- Molecular dynamics simulations
- Fluid dynamics (CFD)
- Machine learning inference
- Monte Carlo methods
- Image processing
Sounio compiles directly to PTX (NVIDIA) and SPIR-V (cross-platform).
Simple Vector Operations
use std::gpu::{kernel, launch, Device}
// Element-wise vector addition: c[i] = a[i] + b[i].
// One GPU thread per element; `&![f32]` marks `c` as the writable buffer.
#[kernel]
fn vector_add(a: &[f32], b: &[f32], c: &![f32]) {
// Global thread index from block/thread coordinates.
let i = thread_idx() + block_idx() * block_dim()
// Guard: the launch may spawn more threads than there are elements.
if i < a.len() {
c[i] = a[i] + b[i]
}
}
// Host entry point: allocate three 1M-element vectors, add them on the
// GPU, and print one element to confirm. The `IO, GPU` effects are
// tracked in the function's type.
fn main() with IO, GPU {
let device = Device::default()
let a = vec![1.0f32; 1_000_000]
let b = vec![2.0f32; 1_000_000]
var c = vec![0.0f32; 1_000_000]
// Transfer to GPU
let d_a = device.upload(&a)
let d_b = device.upload(&b)
// upload_mut: device buffer the kernel may write, backed by `c`.
let d_c = device.upload_mut(&mut c)
// Launch kernel
// 1024 blocks x 1024 threads = 1,048,576 threads >= 1,000,000 elements;
// the bounds check inside vector_add discards the excess threads.
launch(vector_add,
grid: 1024,
block: 1024,
args: (d_a, d_b, d_c)
)
// Transfer back
device.download(d_c, &mut c)
print("Result: " + c[0].to_string()) // 3.0
}
N-Body Simulation
// One integration step of an all-pairs (O(n^2)) gravitational N-body
// simulation. Each GPU thread owns body `i`, accumulates the force from
// every other body, then advances velocity and position.
// NOTE(review): positions are read and written in place by concurrent
// threads, so thread i may see body j's position from either before or
// after j's update — results depend on scheduling. A double-buffered
// position array would make this deterministic; confirm this is the
// intended trade-off for the example.
#[kernel]
fn nbody_step(
positions: &![Vec3],
velocities: &![Vec3],
masses: &[f32],
dt: f32
) {
let i = thread_idx() + block_idx() * block_dim()
// Launches are rounded up to whole blocks; excess threads exit early.
if i >= positions.len() { return }
var force = Vec3::zero()
// Compute gravitational force from all other bodies
// NOTE(review): no gravitational constant G appears, so forces are in
// simulation units rather than SI — confirm intended scaling.
for j in 0..masses.len() {
if i == j { continue }
let r = positions[j] - positions[i]
// Softening term keeps the force finite when two bodies nearly coincide.
let dist_sq = r.length_squared() + 1e-10 // Softening
let f_mag = masses[i] * masses[j] / dist_sq
force = force + r.normalize() * f_mag
}
// Update velocity and position
// Semi-implicit Euler: the freshly updated velocity drives the position step.
let accel = force / masses[i]
velocities[i] = velocities[i] + accel * dt
positions[i] = positions[i] + velocities[i] * dt
}
// Runs `n_steps` iterations of the N-body kernel over `n_bodies` bodies,
// saving a positions snapshot every 100 steps.
fn simulate_galaxy(n_bodies: usize, n_steps: i32) with IO, GPU {
let device = Device::default()
// Initialize random positions and velocities
var positions = random_sphere(n_bodies, radius: 100.0)
var velocities = random_velocities(n_bodies, max_speed: 1.0)
let masses = random_masses(n_bodies, range: 0.1..10.0)
// Upload to GPU
let d_pos = device.upload_mut(&mut positions)
let d_vel = device.upload_mut(&mut velocities)
let d_mass = device.upload(&masses)
// Round up so every body gets a thread (256 threads per block);
// the kernel's early-return guard handles the overshoot.
let blocks = (n_bodies + 255) / 256
for step in 0..n_steps {
launch(nbody_step,
grid: blocks,
block: 256,
args: (d_pos, d_vel, d_mass, 0.01) // fixed timestep dt = 0.01
)
// Periodically copy positions back to the host for output.
if step % 100 == 0 {
device.download(d_pos, &mut positions)
save_snapshot(positions, step)
}
}
// NOTE(review): after the loop, host-side `positions` holds the last
// snapshot, not the final state — a trailing download would be needed
// if callers expect the step-n_steps positions.
}
Shared Memory Optimization
// Tiled dense matrix multiply: C = A * B for square n x n row-major
// matrices. Each 16x16 thread block computes one 16x16 tile of C,
// staging tiles of A and B through shared memory so each global element
// is fetched once per tile rather than once per output element.
// BUG FIX: the kernel declares TWO TILE*TILE tiles below, so the shared
// memory reservation must cover 2 * 256 floats — the original reserved
// only 256 floats, half of what a_tile + b_tile require.
#[kernel(shared_mem = 2 * 256 * sizeof(f32))]
fn matrix_multiply_tiled(
a: &[f32],
b: &[f32],
c: &![f32],
n: i32
) {
// Tile size: one thread per tile element (16x16 threads per block).
const TILE: i32 = 16
// Shared memory for the current tile of each input matrix.
shared var a_tile: [f32; TILE * TILE]
shared var b_tile: [f32; TILE * TILE]
// Global row/column of C this thread is responsible for.
let row = block_idx_y() * TILE + thread_idx_y()
let col = block_idx_x() * TILE + thread_idx_x()
var sum = 0.0f32
// assumes n is a multiple of TILE — TODO confirm; otherwise trailing
// rows/columns are skipped and the tile loads below read out of bounds.
for tile in 0..(n / TILE) {
// Collaborative loading: each thread fetches one element of each tile.
a_tile[thread_idx_y() * TILE + thread_idx_x()] =
a[row * n + tile * TILE + thread_idx_x()]
b_tile[thread_idx_y() * TILE + thread_idx_x()] =
b[(tile * TILE + thread_idx_y()) * n + col]
// Barrier: both tiles must be fully populated before anyone reads them.
sync_threads()
// Compute partial dot product over this tile.
for k in 0..TILE {
sum = sum + a_tile[thread_idx_y() * TILE + k] *
b_tile[k * TILE + thread_idx_x()]
}
// Barrier: finish reading before the next iteration overwrites the tiles.
sync_threads()
}
// BUG FIX: guard the store so threads mapped past the matrix edge (when
// the grid overshoots n) do not write out of bounds.
if row < n && col < n {
c[row * n + col] = sum
}
}
GPU with Epistemic Types
// GPU Monte Carlo sampler for pi: each thread draws 1000 points in the
// unit square and counts how many fall inside the quarter circle of
// radius 1. Per-thread counts land in `results`; the host reduces them.
#[kernel]
fn monte_carlo_pi(
rng_states: &![RngState],
results: &![i32]
) {
let i = global_thread_idx()
// BUG FIX: bounds guard, consistent with the other kernels in this file.
// The current caller launches exactly results.len() threads, but any
// launch rounded up to whole blocks would index past both buffers.
if i >= results.len() { return }
var inside = 0
for _ in 0..1000 {
let x = rng_states[i].uniform()
let y = rng_states[i].uniform()
// Hit test against the quarter circle x^2 + y^2 < 1.
if x * x + y * y < 1.0 {
inside = inside + 1
}
}
results[i] = inside
}
// Estimates pi on the GPU via Monte Carlo and returns it as epistemic
// Knowledge: the point estimate, a 95% uncertainty half-width, and
// provenance of how it was produced.
fn estimate_pi() -> Knowledge<f64> with GPU {
// BUG FIX: `device` was used below but never created in the original.
let device = Device::default()
let n_threads = 10000
let samples_per_thread = 1000 // must match the kernel's inner loop count
var results = vec![0i32; n_threads]
let d_results = device.upload_mut(&mut results)
let d_rng = device.create_rng_states(n_threads)
// 100 blocks x 100 threads = exactly n_threads.
launch(monte_carlo_pi, grid: 100, block: 100, args: (d_rng, d_results))
device.download(d_results, &mut results)
let total_inside = results.sum() as f64
let total_samples = (n_threads * samples_per_thread) as f64
let pi_estimate = 4.0 * total_inside / total_samples
// Standard error of the estimator: with p = hits/N, Var(4p) = 16 p(1-p)/N
// and pi_estimate * (4 - pi_estimate) = 16 p(1-p), so the square root
// below is already the full SE of the pi estimate.
// BUG FIX: the original multiplied by an extra factor of 4; for the
// stated 95% confidence the half-width is 1.96 * SE.
let std_err = (pi_estimate * (4.0 - pi_estimate) / total_samples).sqrt()
let uncertainty = 1.96 * std_err
Knowledge {
value: pi_estimate,
uncertainty: uncertainty,
confidence: 0.95,
provenance: vec!["monte_carlo_gpu"],
}
}
Features for GPU Computing
- Native compilation: Direct to PTX/SPIR-V
- Memory management: Automatic transfers
- Shared memory: Easy tile-based algorithms
- Multiple backends: CUDA, OpenCL, Vulkan compute
- Effect tracking: GPU marked in type system
Supported Hardware
- NVIDIA GPUs (CUDA/PTX)
- AMD GPUs (ROCm)
- Intel GPUs (oneAPI)
- Cross-platform via SPIR-V
Get Started
sounio new gpu-project --template gpu
cd gpu-project
sounio run examples/vector_add.sio --gpu