🚀 GPU Computing

High-performance physics simulations with native GPU kernels

GPU Computing with Sounio

Sounio provides native GPU support for high-performance scientific computing: kernels are written in Sounio itself, so you never leave the language.

Why GPU in Sounio?

Scientific computing often requires massive parallelism:

  • Molecular dynamics simulations
  • Fluid dynamics (CFD)
  • Machine learning inference
  • Monte Carlo methods
  • Image processing

Sounio compiles directly to PTX (NVIDIA) and SPIR-V (cross-platform).

Simple Vector Operations

use std::gpu::{kernel, launch, Device}

#[kernel]
fn vector_add(a: &[f32], b: &[f32], c: &![f32]) {
    // Global thread index: one thread per element
    let i = thread_idx() + block_idx() * block_dim()
    // Guard: the grid may launch more threads than there are elements
    if i < a.len() {
        c[i] = a[i] + b[i]
    }
}

fn main() with IO, GPU {
    let device = Device::default()

    let a = vec![1.0f32; 1_000_000]
    let b = vec![2.0f32; 1_000_000]
    var c = vec![0.0f32; 1_000_000]

    // Transfer to GPU
    let d_a = device.upload(&a)
    let d_b = device.upload(&b)
    let d_c = device.upload_mut(&mut c)

    // Launch kernel: 1024 blocks x 1024 threads = 1,048,576 threads,
    // enough to cover all 1,000,000 elements (the in-kernel guard skips the excess)
    launch(vector_add,
        grid: 1024,
        block: 1024,
        args: (d_a, d_b, d_c)
    )

    // Transfer back
    device.download(d_c, &mut c)

    print("Result: " + c[0].to_string())  // 3.0
}
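
The grid size above is hard-coded; for arbitrary input sizes the usual pattern is ceiling division, which the N-body example below also uses. A minimal sketch (blocks_for is our own helper, not part of std::gpu):

fn blocks_for(n: usize, block: usize) -> usize {
    // Smallest number of blocks such that blocks * block >= n
    (n + block - 1) / block
}

// blocks_for(1_000_000, 256) == 3907, since 3907 * 256 = 1_000_192 >= 1_000_000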

N-Body Simulation

#[kernel]
fn nbody_step(
    positions: &![Vec3],
    velocities: &![Vec3],
    masses: &[f32],
    dt: f32
) {
    let i = thread_idx() + block_idx() * block_dim()
    if i >= positions.len() { return }

    var force = Vec3::zero()

    // Gravitational force from all other bodies (G = 1 in simulation units)
    for j in 0..masses.len() {
        if i == j { continue }

        let r = positions[j] - positions[i]
        let dist_sq = r.length_squared() + 1e-10  // Softening avoids division by zero
        let f_mag = masses[i] * masses[j] / dist_sq
        force = force + r.normalize() * f_mag
    }

    // Update velocity and position (semi-implicit Euler)
    // Note: writing positions[i] while other threads may still be reading it
    // races within a step; see the double-buffered variant below
    let accel = force / masses[i]
    velocities[i] = velocities[i] + accel * dt
    positions[i] = positions[i] + velocities[i] * dt
}
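
One caveat with this kernel: within a single step, thread i updates positions[i] while other threads may still be reading it, so results can depend on thread scheduling. The standard remedy is double buffering: read from one position array, write to another, and swap between steps. A sketch of that variant (the kernel name is ours; it reuses only constructs shown above):

#[kernel]
fn nbody_step_buffered(
    pos_in: &[Vec3],       // read-only snapshot from the previous step
    pos_out: &![Vec3],     // positions written for the next step
    velocities: &![Vec3],
    masses: &[f32],
    dt: f32
) {
    let i = thread_idx() + block_idx() * block_dim()
    if i >= pos_in.len() { return }

    var force = Vec3::zero()
    for j in 0..masses.len() {
        if i == j { continue }
        let r = pos_in[j] - pos_in[i]   // reads only the immutable snapshot
        let dist_sq = r.length_squared() + 1e-10
        let f_mag = masses[i] * masses[j] / dist_sq
        force = force + r.normalize() * f_mag
    }

    let accel = force / masses[i]
    velocities[i] = velocities[i] + accel * dt
    pos_out[i] = pos_in[i] + velocities[i] * dt  // writes only the output buffer
}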

fn simulate_galaxy(n_bodies: usize, n_steps: i32) with IO, GPU {
    let device = Device::default()

    // Initialize random positions and velocities
    var positions = random_sphere(n_bodies, radius: 100.0)
    var velocities = random_velocities(n_bodies, max_speed: 1.0)
    let masses = random_masses(n_bodies, range: 0.1..10.0)

    // Upload to GPU
    let d_pos = device.upload_mut(&mut positions)
    let d_vel = device.upload_mut(&mut velocities)
    let d_mass = device.upload(&masses)

    // Ceiling division: enough 256-thread blocks to cover all bodies
    let blocks = (n_bodies + 255) / 256

    for step in 0..n_steps {
        launch(nbody_step,
            grid: blocks,
            block: 256,
            args: (d_pos, d_vel, d_mass, 0.01)
        )

        if step % 100 == 0 {
            device.download(d_pos, &mut positions)
            save_snapshot(positions, step)
        }
    }
}
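
With the double-buffered kernel, the host loop keeps two device-side position buffers and alternates them between launches. A sketch under the same assumptions (the second upload_mut buffer and the swap helper are ours, not confirmed std::gpu API):

    var positions_next = positions.clone()
    var d_pos_in = device.upload_mut(&mut positions)
    var d_pos_out = device.upload_mut(&mut positions_next)

    for step in 0..n_steps {
        launch(nbody_step_buffered,
            grid: blocks,
            block: 256,
            args: (d_pos_in, d_pos_out, d_vel, d_mass, 0.01)
        )
        // The buffer just written becomes the next step's input
        swap(&mut d_pos_in, &mut d_pos_out)
    }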

Shared Memory Optimization

#[kernel(shared_mem = 2 * 256 * sizeof(f32))]  // two 16x16 f32 tiles
fn matrix_multiply_tiled(
    a: &[f32],
    b: &[f32],
    c: &![f32],
    n: i32
) {
    // Tile size (n is assumed to be a multiple of TILE)
    const TILE: i32 = 16

    // Shared memory for tiles
    shared var a_tile: [f32; TILE * TILE]
    shared var b_tile: [f32; TILE * TILE]

    let row = block_idx_y() * TILE + thread_idx_y()
    let col = block_idx_x() * TILE + thread_idx_x()
    var sum = 0.0f32

    for tile in 0..(n / TILE) {
        // Collaborative loading
        a_tile[thread_idx_y() * TILE + thread_idx_x()] =
            a[row * n + tile * TILE + thread_idx_x()]
        b_tile[thread_idx_y() * TILE + thread_idx_x()] =
            b[(tile * TILE + thread_idx_y()) * n + col]

        sync_threads()  // Wait until both tiles are fully loaded

        // Compute partial dot product
        for k in 0..TILE {
            sum = sum + a_tile[thread_idx_y() * TILE + k] *
                        b_tile[k * TILE + thread_idx_x()]
        }

        sync_threads()  // Don't load the next tiles while others still read these
    }

    c[row * n + col] = sum
}
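
The tiled kernel indexes threads in two dimensions (block_idx_x/y, thread_idx_x/y), so it needs a two-dimensional launch. The launch calls above used scalar grid and block sizes; assuming tuple forms are also accepted (an assumption on our part, mirroring CUDA's dim3 convention), a launch for an n x n multiply would look like:

// Hypothetical 2D launch; tuple grid/block arguments are an assumption.
// n must be a multiple of TILE (16) for the loads above to stay in bounds.
let n = 1024
let d_a = device.upload(&a)
let d_b = device.upload(&b)
let d_c = device.upload_mut(&mut c)

launch(matrix_multiply_tiled,
    grid: (n / 16, n / 16),
    block: (16, 16),
    args: (d_a, d_b, d_c, n)
)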

GPU with Epistemic Types

#[kernel]
fn monte_carlo_pi(
    rng_states: &![RngState],
    results: &![i32]
) {
    let i = global_thread_idx()
    var inside = 0

    // 1,000 samples per thread; must match samples_per_thread on the host
    for _ in 0..1000 {
        let x = rng_states[i].uniform()
        let y = rng_states[i].uniform()

        if x * x + y * y < 1.0 {
            inside = inside + 1
        }
    }

    results[i] = inside
}

fn estimate_pi() -> Knowledge<f64> with GPU {
    let device = Device::default()
    let n_threads = 10000
    let samples_per_thread = 1000  // must match the loop count in the kernel

    var results = vec![0i32; n_threads]
    let d_results = device.upload_mut(&mut results)
    let d_rng = device.create_rng_states(n_threads)

    launch(monte_carlo_pi, grid: 100, block: 100, args: (d_rng, d_results))
    device.download(d_results, &mut results)

    let total_inside = results.sum() as f64
    let total_samples = (n_threads * samples_per_thread) as f64

    // pi_hat = 4p, where p is the fraction of points inside the circle.
    // Var(p) = p(1 - p)/N, so the standard error of pi_hat is
    // 4 * sqrt(p(1 - p)/N) = sqrt(pi_hat * (4 - pi_hat) / N).
    let pi_estimate = 4.0 * total_inside / total_samples
    let std_err = (pi_estimate * (4.0 - pi_estimate) / total_samples).sqrt()

    Knowledge {
        value: pi_estimate,
        uncertainty: 1.96 * std_err,  // half-width of the 95% confidence interval
        confidence: 0.95,
        provenance: vec!["monte_carlo_gpu"],
    }
}
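
Callers then get both the estimate and its error bar in one value. A minimal usage sketch (field access mirrors the struct literal above; the printed output is illustrative):

fn main() with IO, GPU {
    let pi = estimate_pi()
    // e.g. prints: pi = 3.1414 ± 0.0010 (95% confidence)
    print("pi = " + pi.value.to_string()
        + " ± " + pi.uncertainty.to_string()
        + " (95% confidence)")
}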

Features for GPU Computing

  • Native compilation: Direct to PTX/SPIR-V
  • Memory management: Automatic transfers
  • Shared memory: Easy tile-based algorithms
  • Multiple backends: CUDA, OpenCL, Vulkan compute
  • Effect tracking: GPU usage is marked in the type system (see the sketch after this list)
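
As a sketch of what effect tracking buys you: any function that launches kernels must declare the GPU effect, and that requirement propagates to callers (the propagation rule here is our reading of the with syntax used throughout this page, not a confirmed specification):

fn run_simulation() with GPU {
    let device = Device::default()
    // ... upload buffers and launch kernels here ...
}

fn main() with IO, GPU {   // callers must also carry the GPU effect
    run_simulation()
}

// fn main() with IO { run_simulation() }  // would be rejected: GPU effect missing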

Supported Hardware

  • NVIDIA GPUs (CUDA/PTX)
  • AMD GPUs (ROCm)
  • Intel GPUs (oneAPI)
  • Cross-platform via SPIR-V

Get Started

sounio new gpu-project --template gpu
cd gpu-project
sounio run examples/vector_add.sio --gpu