GPU Backends

Backend Trait Hierarchy

// crates/ringkernel-core/src/backend.rs

use async_trait::async_trait;

/// Core GPU backend abstraction.
#[async_trait]
pub trait GpuBackend: Send + Sync + 'static {
    /// Backend identifier.
    fn name(&self) -> &'static str;

    /// Number of available devices.
    fn device_count(&self) -> usize;

    /// Get device properties.
    fn device_info(&self, device_id: usize) -> Result<DeviceInfo>;

    /// Create a new execution context.
    async fn create_context(&self, device_id: usize) -> Result<Box<dyn GpuContext>>;
}

/// GPU execution context (one per device).
#[async_trait]
pub trait GpuContext: Send + Sync {
    /// Allocate device memory.
    async fn allocate(&self, size: usize) -> Result<DevicePtr>;

    /// Free device memory.
    async fn free(&self, ptr: DevicePtr) -> Result<()>;

    /// Copy host → device.
    async fn copy_to_device(&self, src: &[u8], dst: DevicePtr) -> Result<()>;

    /// Copy device → host.
    async fn copy_to_host(&self, src: DevicePtr, dst: &mut [u8]) -> Result<()>;

    /// Copy device → device.
    async fn copy_device_to_device(&self, src: DevicePtr, dst: DevicePtr, size: usize) -> Result<()>;

    /// Compile kernel source.
    async fn compile(&self, source: &str, options: &CompileOptions) -> Result<Box<dyn CompiledKernel>>;

    /// Create execution stream.
    fn create_stream(&self) -> Result<Box<dyn GpuStream>>;

    /// Synchronize all pending operations.
    async fn synchronize(&self) -> Result<()>;
}

/// Compiled GPU kernel.
#[async_trait]
pub trait CompiledKernel: Send + Sync {
    /// Get kernel function by name.
    fn get_function(&self, name: &str) -> Result<Box<dyn KernelFunction>>;
}

/// Executable kernel function.
#[async_trait]
pub trait KernelFunction: Send + Sync {
    /// Launch kernel.
    async fn launch(
        &self,
        grid: Dim3,
        block: Dim3,
        shared_mem: usize,
        stream: &dyn GpuStream,
        args: &[KernelArg],
    ) -> Result<()>;
}
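
End to end, the traits compose into a short host-side flow: create a context, compile, resolve a function, launch on a stream. A hedged sketch (the grid/block dimensions and the Dim3::new / CompileOptions::default constructors are assumptions, not part of the trait definitions above):

// Sketch: driving the backend abstraction end to end.
async fn run_kernel(backend: &dyn GpuBackend, source: &str) -> Result<()> {
    let ctx = backend.create_context(0).await?;

    // Compile generated source and resolve the entry point.
    let module = ctx.compile(source, &CompileOptions::default()).await?;
    let func = module.get_function("ring_kernel_persistent")?;

    // Launch on a dedicated stream; dimensions are placeholders.
    let stream = ctx.create_stream()?;
    func.launch(
        Dim3::new(64, 1, 1),  // grid
        Dim3::new(256, 1, 1), // block
        0,                    // dynamic shared memory (bytes)
        stream.as_ref(),
        &[],                  // kernel arguments elided
    )
    .await?;

    ctx.synchronize().await
}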

CUDA Backend

Implementation with cudarc

// crates/ringkernel-cuda/src/lib.rs

use std::sync::Arc;

use async_trait::async_trait;
use cudarc::driver::{CudaDevice, CudaSlice, LaunchAsync, LaunchConfig};
use cudarc::nvrtc::compile_ptx;

pub struct CudaBackend {
    devices: Vec<Arc<CudaDevice>>,
}

impl CudaBackend {
    pub fn new() -> Result<Self> {
        let device_count = CudaDevice::count()? as usize;
        // cudarc's CudaDevice::new already yields Arc<CudaDevice>.
        let devices = (0..device_count)
            .map(CudaDevice::new)
            .collect::<Result<Vec<_>, _>>()?;

        Ok(Self { devices })
    }
}

#[async_trait]
impl GpuBackend for CudaBackend {
    fn name(&self) -> &'static str { "cuda" }

    fn device_count(&self) -> usize { self.devices.len() }

    fn device_info(&self, device_id: usize) -> Result<DeviceInfo> {
        let dev = &self.devices[device_id];
        Ok(DeviceInfo {
            name: dev.name()?,
            compute_capability: dev.compute_capability(),
            total_memory: dev.total_memory()?,
            // ...
        })
    }

    async fn create_context(&self, device_id: usize) -> Result<Box<dyn GpuContext>> {
        Ok(Box::new(CudaContext {
            device: self.devices[device_id].clone(),
        }))
    }
}

pub struct CudaContext {
    device: Arc<CudaDevice>,
}

#[async_trait]
impl GpuContext for CudaContext {
    async fn allocate(&self, size: usize) -> Result<DevicePtr> {
        let slice: CudaSlice<u8> = self.device.alloc_zeros(size)?;
        Ok(DevicePtr::from_cuda(slice))
    }

    async fn compile(&self, source: &str, _options: &CompileOptions) -> Result<Box<dyn CompiledKernel>> {
        // NVRTC turns the generated CUDA C into PTX, which is then
        // loaded as a named module on this device.
        let ptx = compile_ptx(source)?;
        self.device.load_ptx(ptx, "ring_kernel", &["ring_kernel_persistent"])?;
        Ok(Box::new(CudaCompiledKernel { device: self.device.clone() }))
    }

    // ... other methods
}

CUDA Ring Kernel Template

// Generated CUDA C for ring kernels
#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include <cuda/atomic>

namespace cg = cooperative_groups;

struct ControlBlock {
    cuda::atomic<int> is_active;
    cuda::atomic<int> should_terminate;
    cuda::atomic<int> has_terminated;
    cuda::atomic<int> errors_encountered;
    cuda::atomic<long long> messages_processed;
    // ... queue pointers
};

template<typename TInput, typename TOutput>
__global__ void ring_kernel_persistent(
    ControlBlock* control,
    TInput* input_buffer,
    cuda::atomic<int>* input_head,
    cuda::atomic<int>* input_tail,
    TOutput* output_buffer,
    cuda::atomic<int>* output_head,
    cuda::atomic<int>* output_tail,
    int queue_capacity
) {
    auto grid = cg::this_grid();

    // Persistent kernel loop
    while (!control->should_terminate.load(cuda::memory_order_acquire)) {
        // Check if active
        if (!control->is_active.load(cuda::memory_order_acquire)) {
            // Yield to avoid busy-waiting
            __nanosleep(1000);
            continue;
        }

        // Try dequeue input
        int head = input_head->load(cuda::memory_order_relaxed);
        int tail = input_tail->load(cuda::memory_order_acquire);

        if (head != tail) {
            // Claim slot
            int next_head = (head + 1) & (queue_capacity - 1); // capacity must be a power of two
            if (input_head->compare_exchange_strong(head, next_head,
                    cuda::memory_order_acq_rel)) {

                TInput msg = input_buffer[head];

                // === USER HANDLER CODE ===
                TOutput response = process_message(msg);
                // =========================

                // Enqueue output
                enqueue_output(output_buffer, output_head, output_tail,
                              queue_capacity, response);

                control->messages_processed.fetch_add(1, cuda::memory_order_relaxed);
            }
        }

        grid.sync(); // Cooperative grid sync (requires a cooperative launch)
    }

    // Mark terminated
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        control->has_terminated.store(1, cuda::memory_order_release);
    }
}

Metal Backend

Implementation with metal-rs

// crates/ringkernel-metal/src/lib.rs

use metal::{Device, Library, CommandQueue, Buffer, MTLResourceOptions};

pub struct MetalBackend {
    device: Device,
    command_queue: CommandQueue,
}

impl MetalBackend {
    pub fn new() -> Result<Self> {
        let device = Device::system_default()
            .ok_or(Error::NoDevice)?;
        let command_queue = device.new_command_queue();

        Ok(Self { device, command_queue })
    }
}

#[async_trait]
impl GpuBackend for MetalBackend {
    fn name(&self) -> &'static str { "metal" }

    fn device_count(&self) -> usize { 1 } // metal-rs exposes the system default device

    fn device_info(&self, _device_id: usize) -> Result<DeviceInfo> {
        Ok(DeviceInfo {
            name: self.device.name().to_string(),
            total_memory: self.device.recommended_max_working_set_size() as usize,
            // ...
        })
    }

    async fn create_context(&self, _device_id: usize) -> Result<Box<dyn GpuContext>> {
        Ok(Box::new(MetalContext {
            device: self.device.clone(),
            queue: self.command_queue.clone(),
        }))
    }
}

pub struct MetalContext {
    device: Device,
    queue: CommandQueue,
}

#[async_trait]
impl GpuContext for MetalContext {
    async fn allocate(&self, size: usize) -> Result<DevicePtr> {
        let buffer = self.device.new_buffer(
            size as u64,
            MTLResourceOptions::StorageModeShared,
        );
        Ok(DevicePtr::from_metal(buffer))
    }

    async fn compile(&self, source: &str, _options: &CompileOptions) -> Result<Box<dyn CompiledKernel>> {
        let library = self.device
            .new_library_with_source(source, &metal::CompileOptions::new())?;
        Ok(Box::new(MetalCompiledKernel { library }))
    }

    // ... other methods
}

Metal Shading Language Template

// Generated MSL for ring kernels
#include <metal_stdlib>
#include <metal_atomic>
using namespace metal;

struct ControlBlock {
    atomic_int is_active;
    atomic_int should_terminate;
    atomic_int has_terminated;
    atomic_int errors_encountered;
    atomic_uint messages_processed; // MSL portably guarantees only 32-bit atomics
    // ... queue pointers
};

kernel void ring_kernel_persistent(
    device ControlBlock* control [[buffer(0)]],
    device InputMessage* input_buffer [[buffer(1)]],
    device atomic_int* input_head [[buffer(2)]],
    device atomic_int* input_tail [[buffer(3)]],
    device OutputMessage* output_buffer [[buffer(4)]],
    device atomic_int* output_head [[buffer(5)]],
    device atomic_int* output_tail [[buffer(6)]],
    constant int& queue_capacity [[buffer(7)]],
    uint tid [[thread_position_in_grid]]
) {
    // Note: Metal doesn't support true persistent kernels, so the host
    // re-dispatches this kernel per batch (see the host-side sketch below).
    // MSL atomics only accept memory_order_relaxed; cross-dispatch
    // visibility is provided by command buffer boundaries.

    int head = atomic_load_explicit(input_head, memory_order_relaxed);
    int tail = atomic_load_explicit(input_tail, memory_order_relaxed);

    if (head == tail) return; // No messages

    // Process messages in batch
    while (head != tail) {
        int next_head = (head + 1) & (queue_capacity - 1); // capacity must be a power of two
        if (atomic_compare_exchange_weak_explicit(
                input_head, &head, next_head,
                memory_order_relaxed, memory_order_relaxed)) {

            InputMessage msg = input_buffer[head];
            OutputMessage response = process_message(msg);
            enqueue_output(output_buffer, output_head, output_tail,
                          queue_capacity, response);

            atomic_fetch_add_explicit(&control->messages_processed, 1,
                                      memory_order_relaxed);
        }
        head = atomic_load_explicit(input_head, memory_order_relaxed);
        tail = atomic_load_explicit(input_tail, memory_order_relaxed);
    }
}
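
The relaunch pattern referenced in the template lives on the host: each batch is one command buffer dispatch. A minimal metal-rs sketch, assuming a prebuilt pipeline and the seven buffers from the template (setup elided):

use metal::{Buffer, CommandQueue, ComputePipelineState, MTLSize};

// Sketch of the event-driven relaunch loop.
fn drain_once(queue: &CommandQueue, pipeline: &ComputePipelineState, buffers: &[Buffer]) {
    let cmd = queue.new_command_buffer();
    let enc = cmd.new_compute_command_encoder();
    enc.set_compute_pipeline_state(pipeline);
    for (i, buf) in buffers.iter().enumerate() {
        enc.set_buffer(i as u64, Some(buf), 0); // matches the [[buffer(n)]] slots
    }
    // One 256-thread threadgroup drives the CAS dequeue loop.
    enc.dispatch_thread_groups(MTLSize::new(1, 1, 1), MTLSize::new(256, 1, 1));
    enc.end_encoding();
    cmd.commit();
    cmd.wait_until_completed(); // a real host would chain completion handlers
}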

WebGPU Backend

Implementation with wgpu

// crates/ringkernel-wgpu/src/lib.rs

use wgpu::{Device, Queue, ShaderModule, ComputePipeline};

pub struct WgpuBackend {
    instance: wgpu::Instance,
    adapters: Vec<wgpu::Adapter>,
}

impl WgpuBackend {
    pub async fn new() -> Result<Self> {
        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
            backends: wgpu::Backends::all(),
            ..Default::default()
        });

        let adapters = instance.enumerate_adapters(wgpu::Backends::all())
            .collect();

        Ok(Self { instance, adapters })
    }
}

#[async_trait]
impl GpuBackend for WgpuBackend {
    fn name(&self) -> &'static str { "wgpu" }

    fn device_count(&self) -> usize { self.adapters.len() }

    fn device_info(&self, device_id: usize) -> Result<DeviceInfo> {
        let info = self.adapters[device_id].get_info();
        Ok(DeviceInfo {
            name: info.name,
            // ...
        })
    }

    async fn create_context(&self, device_id: usize) -> Result<Box<dyn GpuContext>> {
        let adapter = &self.adapters[device_id];
        let (device, queue) = adapter.request_device(
            &wgpu::DeviceDescriptor::default(),
            None,
        ).await?;

        Ok(Box::new(WgpuContext { device, queue }))
    }
}

pub struct WgpuContext {
    device: Device,
    queue: Queue,
}

WGSL Template

// Generated WGSL for ring kernels
struct ControlBlock {
    is_active: atomic<i32>,
    should_terminate: atomic<i32>,
    has_terminated: atomic<i32>,
    errors_encountered: atomic<i32>,
    messages_processed: atomic<i32>, // WGSL doesn't have 64-bit atomics
}

@group(0) @binding(0) var<storage, read_write> control: ControlBlock;
@group(0) @binding(1) var<storage, read_write> input_buffer: array<InputMessage>;
@group(0) @binding(2) var<storage, read_write> input_head: atomic<i32>;
@group(0) @binding(3) var<storage, read_write> input_tail: atomic<i32>;
@group(0) @binding(4) var<storage, read_write> output_buffer: array<OutputMessage>;
@group(0) @binding(5) var<storage, read_write> output_head: atomic<i32>;
@group(0) @binding(6) var<storage, read_write> output_tail: atomic<i32>;

@compute @workgroup_size(256)
fn ring_kernel(@builtin(global_invocation_id) gid: vec3<u32>) {
    let head = atomicLoad(&input_head);
    let tail = atomicLoad(&input_tail);

    if (head == tail) { return; }

    // Process message
    // ...
}
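
WGSL has the same constraint as MSL: no persistent kernels, so the host encodes one compute pass per batch. A hedged sketch against the wgpu API (pipeline and bind group creation elided; exact signatures vary slightly across wgpu versions):

// Sketch: one relaunch of the ring kernel as a compute pass.
fn dispatch_once(
    device: &wgpu::Device,
    queue: &wgpu::Queue,
    pipeline: &wgpu::ComputePipeline,
    bind_group: &wgpu::BindGroup,
) {
    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
    {
        let mut pass =
            encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default());
        pass.set_pipeline(pipeline);
        pass.set_bind_group(0, bind_group, &[]);
        pass.dispatch_workgroups(1, 1, 1); // one @workgroup_size(256) group
    }
    queue.submit(Some(encoder.finish()));
}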

Persistent vs Traditional Kernel Patterns

RingKernel supports two fundamentally different GPU execution models, each optimized for different workloads.

Traditional Kernels (Launch-per-Command)

Host                              GPU
  │                                │
  ├─── cudaMemcpy(H→D) ───────────>│ (20-50µs)
  ├─── cudaLaunchKernel ──────────>│ (10-30µs)
  │                                ├── Compute
  │<── cudaDeviceSynchronize ──────┤ (5-20µs)
  │<── cudaMemcpy(D→H) ────────────┤ (20-50µs)
  │                                │

Best for: Batch processing, compute-bound workloads, and running thousands of steps in one submission.

Persistent Kernels (Actor Model)

Host                              GPU
  │                                │
  ├─── Launch kernel (once) ──────>│ ← Single launch
  │                                ├── Persistent loop
  │    ┌─────────────────────────┐ │
  │    │ for each command:       │ │
  ├────┤  Write to mapped mem ───┼─┤ (~10-50ns)
  │<───┤  Poll response ─────────┼─┤ (~1-5µs)
  │    └─────────────────────────┘ │
  │                                │
  ├─── Terminate signal ──────────>│
  │                                │

Best for: Interactive applications, real-time GUIs, dynamic parameter changes, command-heavy workloads.
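
On the host, the hot path is just a store into host-mapped queue memory followed by a poll; no driver call is involved. A sketch with a hypothetical MappedQueue wrapper standing in for the pinned, GPU-visible ring buffers:

// Sketch of the host hot path. MappedQueue is hypothetical.
fn send_command(
    input: &MappedQueue<InputMessage>,
    output: &MappedQueue<OutputMessage>,
    msg: InputMessage,
) -> OutputMessage {
    input.push(msg); // plain store into mapped memory (~10-50ns)
    loop {
        if let Some(resp) = output.try_pop() {
            return resp; // the persistent kernel wrote this in place
        }
        std::hint::spin_loop(); // busy-poll; a real host would back off
    }
}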

Performance Comparison (RTX Ada)

Operation                   Traditional   Persistent   Winner
Inject command              317 µs        0.03 µs      Persistent (11,327x)
Single compute step         3.2 µs        163 µs       Traditional (51x)
Mixed workload (300 ops)    40.5 ms       15.3 ms      Persistent (2.7x)

Decision Guide

Your Workload                   Recommended   Why
Run 10,000 simulation steps     Traditional   Launch overhead amortized
Real-time GUI at 60 FPS         Persistent    2.7x more ops per frame
Interactive parameter tuning    Persistent    0.03 µs command latency
Batch matrix operations         Traditional   Maximum compute throughput
Game physics with user input    Persistent    Instant response to input

Backend Selection

/// Automatic backend selection based on platform.
pub async fn select_backend() -> Result<Box<dyn GpuBackend>> {
    #[cfg(feature = "cuda")]
    if let Ok(backend) = CudaBackend::new() {
        if backend.device_count() > 0 {
            return Ok(Box::new(backend));
        }
    }

    #[cfg(all(feature = "metal", target_os = "macos"))]
    if let Ok(backend) = MetalBackend::new() {
        return Ok(Box::new(backend));
    }

    #[cfg(feature = "wgpu")]
    if let Ok(backend) = WgpuBackend::new().await {
        if backend.device_count() > 0 {
            return Ok(Box::new(backend));
        }
    }

    #[cfg(feature = "cpu")]
    return Ok(Box::new(CpuBackend::new()));

    #[cfg(not(feature = "cpu"))]
    Err(Error::NoBackendAvailable)
}
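
Callers never name a concrete backend; a typical startup path might look like the following sketch:

// Sketch: pick whatever backend the platform offers and open device 0.
async fn init() -> Result<Box<dyn GpuContext>> {
    let backend = select_backend().await?;
    println!("backend: {} ({} device(s))", backend.name(), backend.device_count());
    backend.create_context(0).await
}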

Rust-to-CUDA Transpilation

The ringkernel-cuda-codegen crate enables writing CUDA kernels in a Rust DSL:

use ringkernel_cuda_codegen::{transpile_global_kernel, transpile_stencil_kernel, StencilConfig};
use ringkernel_cuda_codegen::dsl::*;
use syn::parse_quote;

// Generic kernel using block/thread indices
let kernel: syn::ItemFn = parse_quote! {
    fn vector_scale(data: &mut [f32], scale: f32, n: i32) {
        let idx = block_idx_x() * block_dim_x() + thread_idx_x();
        if idx >= n { return; }
        data[idx as usize] = data[idx as usize] * scale;
    }
};
let cuda = transpile_global_kernel(&kernel)?;

// Stencil kernel using GridPos abstraction
let stencil: syn::ItemFn = parse_quote! {
    fn laplacian(input: &[f32], output: &mut [f32], pos: GridPos) {
        let lap = pos.north(input) + pos.south(input)
                + pos.east(input) + pos.west(input)
                - 4.0 * input[pos.idx()];
        output[pos.idx()] = lap;
    }
};
let config = StencilConfig::new("laplacian").with_tile_size(16, 16).with_halo(1);
let cuda_stencil = transpile_stencil_kernel(&stencil, &config)?;

DSL Functions (CPU Fallback Implementations)

The dsl module provides functions that compile to CUDA intrinsics but also work on CPU:

use ringkernel_cuda_codegen::dsl::*;

// Thread/block indices (return 0 on CPU, compile to CUDA intrinsics)
let tx = thread_idx_x();  // -> threadIdx.x
let bx = block_idx_x();   // -> blockIdx.x
let bd = block_dim_x();   // -> blockDim.x
let gd = grid_dim_x();    // -> gridDim.x

// Synchronization (no-op on CPU)
sync_threads();           // -> __syncthreads()
thread_fence();           // -> __threadfence()
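
Because the helpers degrade to constants and no-ops on the host, a kernel body can be exercised as plain Rust in a unit test by looping over the launch domain manually. A minimal sketch (the body and test are illustrative, not part of the crate):

// Illustrative: drive a single-thread kernel body from a host loop.
fn vector_scale_body(data: &mut [f32], scale: f32, idx: usize) {
    if idx >= data.len() {
        return;
    }
    data[idx] *= scale;
}

#[test]
fn vector_scale_matches_on_cpu() {
    let mut data = vec![1.0_f32, 2.0, 3.0];
    for i in 0..data.len() {
        vector_scale_body(&mut data, 2.0, i); // one "thread" per element
    }
    assert_eq!(data, vec![2.0, 4.0, 6.0]);
}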

Next: Serialization Strategy