Memory Management
GPU Memory Challenges
Ring Kernels require careful memory management across CPU↔GPU boundaries (the allocator traits assumed by the code in this section are sketched after this list):
- GPU Buffer Allocation/Deallocation
- Pinned Host Memory for DMA transfers
- Unified Memory (where available)
- Memory Pools for frequent allocations
- Cross-Device P2P for multi-GPU
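The buffer types in this section are written against a small set of allocator traits. The following is a minimal sketch, not the authoritative API: the signatures are inferred from how the buffers below call them, and `Result` is the crate's own error alias.
/// Device memory allocator (CUDA, HIP, Metal, ... backends).
/// Sketch only — signatures inferred from usage in the snippets below.
pub trait GpuAllocator: Send + Sync {
    /// Allocate `size` bytes of device memory, returning a raw device pointer.
    fn allocate(&self, size: usize) -> Result<*mut u8>;
    /// Free a pointer previously returned by `allocate`.
    unsafe fn free(&self, ptr: *mut u8) -> Result<()>;
}
/// Page-locked host memory allocator for DMA staging.
pub trait HostAllocator: Send + Sync {
    fn allocate_pinned(&self, size: usize) -> Result<*mut u8>;
    unsafe fn free_pinned(&self, ptr: *mut u8) -> Result<()>;
}
/// Managed (unified) memory allocator.
pub trait UnifiedAllocator: Send + Sync {
    fn allocate_managed(&self, size: usize) -> Result<*mut u8>;
    /// Hint the driver to migrate `size_bytes` starting at `ptr` to `device_id`.
    fn prefetch(&self, ptr: *const u8, size_bytes: usize, device_id: i32) -> Result<()>;
}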
Rust Ownership Model for GPU Memory
GpuBuffer Wrapper
// crates/ringkernel-core/src/buffer.rs
use std::marker::PhantomData;
use std::ptr::NonNull;
/// Owned GPU memory buffer with RAII cleanup.
pub struct GpuBuffer<T: Copy + Send> {
device_ptr: NonNull<T>,
len: usize,
capacity: usize,
allocator: Box<dyn GpuAllocator>,
_marker: PhantomData<T>,
}
// Safety: GPU memory is thread-safe when properly synchronized
unsafe impl<T: Copy + Send> Send for GpuBuffer<T> {}
unsafe impl<T: Copy + Send> Sync for GpuBuffer<T> {}
impl<T: Copy + Send> GpuBuffer<T> {
/// Allocate new GPU buffer.
pub fn new(len: usize, allocator: Box<dyn GpuAllocator>) -> Result<Self> {
let size = len * std::mem::size_of::<T>();
let device_ptr = allocator.allocate(size)?;
Ok(Self {
device_ptr: NonNull::new(device_ptr as *mut T)
.ok_or(Error::AllocationFailed)?,
len,
capacity: len,
allocator,
_marker: PhantomData,
})
}
/// Get raw device pointer.
pub fn as_ptr(&self) -> *const T {
self.device_ptr.as_ptr()
}
/// Get mutable device pointer.
pub fn as_mut_ptr(&mut self) -> *mut T {
self.device_ptr.as_ptr()
}
/// Length in elements.
pub fn len(&self) -> usize {
self.len
}
/// Size in bytes.
pub fn size_bytes(&self) -> usize {
self.len * std::mem::size_of::<T>()
}
}
impl<T: Copy + Send> Drop for GpuBuffer<T> {
fn drop(&mut self) {
// Safety: we own this memory
unsafe {
let _ = self.allocator.free(self.device_ptr.as_ptr() as *mut u8);
}
}
}
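A usage sketch; `CudaDeviceAllocator` is a hypothetical backend type standing in for any `GpuAllocator` implementation.
// Usage sketch — `CudaDeviceAllocator` is hypothetical, not part of the crate.
fn allocate_weights() -> Result<GpuBuffer<f32>> {
    let allocator: Box<dyn GpuAllocator> = Box::new(CudaDeviceAllocator::for_device(0)?);
    // 1024 f32 elements of device memory, freed automatically on drop.
    let buffer = GpuBuffer::new(1024, allocator)?;
    assert_eq!(buffer.size_bytes(), 1024 * std::mem::size_of::<f32>());
    // buffer.as_ptr() / as_mut_ptr() feed kernel launches and copy calls.
    Ok(buffer)
}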
Pinned Host Memory
Pinned (page-locked) host memory is required for efficient DMA: the driver can transfer it asynchronously, whereas pageable memory must first be staged through an internal bounce buffer:
/// Pinned (page-locked) host memory for DMA transfers.
pub struct PinnedBuffer<T: Copy + Send> {
host_ptr: NonNull<T>,
len: usize,
allocator: Box<dyn HostAllocator>,
_marker: PhantomData<T>,
}
impl<T: Copy + Send> PinnedBuffer<T> {
/// Allocate pinned memory.
pub fn new(len: usize, allocator: Box<dyn HostAllocator>) -> Result<Self> {
let size = len * std::mem::size_of::<T>();
let host_ptr = allocator.allocate_pinned(size)?;
Ok(Self {
host_ptr: NonNull::new(host_ptr as *mut T)
.ok_or(Error::AllocationFailed)?,
len,
allocator,
_marker: PhantomData,
})
}
/// Get slice for reading.
pub fn as_slice(&self) -> &[T] {
unsafe { std::slice::from_raw_parts(self.host_ptr.as_ptr(), self.len) }
}
/// Get mutable slice.
pub fn as_mut_slice(&mut self) -> &mut [T] {
unsafe { std::slice::from_raw_parts_mut(self.host_ptr.as_ptr(), self.len) }
}
}
impl<T: Copy + Send> Drop for PinnedBuffer<T> {
fn drop(&mut self) {
unsafe {
let _ = self.allocator.free_pinned(self.host_ptr.as_ptr() as *mut u8);
}
}
}
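A typical upload pairs a PinnedBuffer staging area with a GpuBuffer destination. The sketch below assumes a hypothetical `copy_htod` backend call that takes raw pointers and a byte count; it is not part of the crate API.
/// Stage `data` through pinned memory and copy it to the device.
/// `copy_htod(dst, src, bytes)` is a hypothetical backend function.
fn upload<T: Copy + Send>(
    staging: &mut PinnedBuffer<T>,
    gpu: &mut GpuBuffer<T>,
    data: &[T],
) -> Result<()> {
    // Assumes `data` fits in the staging buffer.
    debug_assert!(data.len() <= staging.as_slice().len());
    // Fill the page-locked staging buffer on the host side.
    staging.as_mut_slice()[..data.len()].copy_from_slice(data);
    // Pinned pages let the driver DMA directly, avoiding a bounce copy.
    copy_htod(
        gpu.as_mut_ptr() as *mut u8,
        staging.as_slice().as_ptr() as *const u8,
        data.len() * std::mem::size_of::<T>(),
    )
}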
Memory Pool for Ring Buffers
Pooling buffers in power-of-two buckets avoids hitting the driver allocator on the hot path of message queues:
/// Thread-safe memory pool for GPU allocations.
pub struct GpuMemoryPool {
allocator: Arc<dyn GpuAllocator>,
pools: Mutex<HashMap<usize, Vec<NonNull<u8>>>>,
stats: AtomicPoolStats,
}
// Safety: the pool only stores raw device pointers that are never
// dereferenced on the host, so moving them across threads is sound
// (mirrors the reasoning for GpuBuffer above). Needed because NonNull
// is neither Send nor Sync.
unsafe impl Send for GpuMemoryPool {}
unsafe impl Sync for GpuMemoryPool {}
impl GpuMemoryPool {
/// Get a buffer from the pool or allocate new.
pub fn acquire(&self, size: usize) -> Result<PooledBuffer> {
let bucket_size = size.next_power_of_two();
let mut pools = self.pools.lock().unwrap();
if let Some(pool) = pools.get_mut(&bucket_size) {
if let Some(ptr) = pool.pop() {
self.stats.hits.fetch_add(1, Ordering::Relaxed);
return Ok(PooledBuffer::from_pool(ptr, bucket_size, self));
}
}
// Allocate new
self.stats.misses.fetch_add(1, Ordering::Relaxed);
let ptr = self.allocator.allocate(bucket_size)?;
Ok(PooledBuffer::new(
NonNull::new(ptr).ok_or(Error::AllocationFailed)?,
bucket_size,
self,
))
}
/// Return buffer to pool.
pub fn release(&self, ptr: NonNull<u8>, size: usize) {
let mut pools = self.pools.lock().unwrap();
pools.entry(size).or_default().push(ptr);
self.stats.returns.fetch_add(1, Ordering::Relaxed);
}
}
/// RAII handle that returns to pool on drop.
pub struct PooledBuffer<'a> {
ptr: NonNull<u8>,
size: usize,
pool: &'a GpuMemoryPool,
}
impl Drop for PooledBuffer<'_> {
fn drop(&mut self) {
self.pool.release(self.ptr, self.size);
}
}
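The stats type referenced above is just a set of relaxed counters. A sketch of the assumed layout, with a `hit_rate` helper added for illustration:
use std::sync::atomic::AtomicU64;

/// Pool counters, updated with Relaxed ordering (layout assumed from usage above).
#[derive(Default)]
pub struct AtomicPoolStats {
    pub hits: AtomicU64,    // acquire() served from a bucket
    pub misses: AtomicU64,  // acquire() had to allocate
    pub returns: AtomicU64, // release() pushed a buffer back
}

impl AtomicPoolStats {
    /// Fraction of acquisitions served without touching the allocator.
    pub fn hit_rate(&self) -> f64 {
        let hits = self.hits.load(std::sync::atomic::Ordering::Relaxed) as f64;
        let misses = self.misses.load(std::sync::atomic::Ordering::Relaxed) as f64;
        if hits + misses == 0.0 { 0.0 } else { hits / (hits + misses) }
    }
}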
Unified Memory (CUDA Managed Memory)
/// Unified memory accessible from CPU and GPU.
pub struct UnifiedBuffer<T: Copy + Send> {
ptr: NonNull<T>,
len: usize,
allocator: Box<dyn UnifiedAllocator>,
_marker: PhantomData<T>,
}
impl<T: Copy + Send> UnifiedBuffer<T> {
pub fn new(len: usize, allocator: Box<dyn UnifiedAllocator>) -> Result<Self> {
let ptr = allocator.allocate_managed(len * std::mem::size_of::<T>())?;
Ok(Self {
ptr: NonNull::new(ptr as *mut T).ok_or(Error::AllocationFailed)?,
len,
allocator,
_marker: PhantomData,
})
}
/// Access from CPU (may trigger page migration).
pub fn as_slice(&self) -> &[T] {
unsafe { std::slice::from_raw_parts(self.ptr.as_ptr(), self.len) }
}
    /// Prefetch to GPU. The size passed to the allocator is in bytes,
    /// matching `allocate_managed`.
    pub fn prefetch_to_device(&self, device_id: i32) -> Result<()> {
        let bytes = self.len * std::mem::size_of::<T>();
        self.allocator.prefetch(self.ptr.as_ptr() as *const u8, bytes, device_id)
    }
}
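For symmetry with the other buffer types, mutable CPU access and RAII cleanup would look like the following sketch; `free_managed` is a hypothetical counterpart to `allocate_managed` and is not shown in the snippets above.
impl<T: Copy + Send> UnifiedBuffer<T> {
    /// Mutable CPU access (may also trigger page migration back to the host).
    pub fn as_mut_slice(&mut self) -> &mut [T] {
        unsafe { std::slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len) }
    }
}

impl<T: Copy + Send> Drop for UnifiedBuffer<T> {
    fn drop(&mut self) {
        unsafe {
            // `free_managed` is assumed here for cleanup symmetry.
            let _ = self.allocator.free_managed(self.ptr.as_ptr() as *mut u8);
        }
    }
}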
WSL2 Memory Visibility Issues
Critical: WSL2 has limited GPU memory coherence (see CLAUDE.md).
/// Memory visibility strategy for different platforms.
#[derive(Debug, Clone, Copy)]
pub enum MemoryVisibility {
/// Full system-scope atomics (native Linux)
SystemScope,
/// Device-scope only, requires explicit sync (WSL2)
DeviceScope,
/// Explicit DMA transfers only
ExplicitDma,
}
impl MemoryVisibility {
/// Detect platform capabilities.
pub fn detect() -> Self {
#[cfg(target_os = "linux")]
{
if is_wsl2() {
return Self::DeviceScope; // WSL2 limitation
}
Self::SystemScope
}
#[cfg(target_os = "windows")]
{
Self::SystemScope // Native Windows CUDA works
}
#[cfg(target_os = "macos")]
{
Self::DeviceScope // Metal doesn't have system atomics
        }
        #[cfg(not(any(target_os = "linux", target_os = "windows", target_os = "macos")))]
        {
            // Conservative fallback so detect() still compiles on other targets.
            Self::ExplicitDma
        }
    }
}
fn is_wsl2() -> bool {
std::fs::read_to_string("/proc/version")
.map(|v| v.contains("microsoft") || v.contains("WSL"))
.unwrap_or(false)
}
Memory Transfer Strategies
/// Strategy for host↔device data transfer.
pub enum TransferStrategy {
/// Synchronous copy (blocking)
Sync,
/// Async copy with stream
Async { stream: StreamHandle },
/// Zero-copy via unified memory
Unified,
/// Explicit DMA with polling
DmaPolling { poll_interval_us: u32 },
}
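The detected MemoryVisibility can seed a default transfer strategy. The mapping below is illustrative only; the choice of defaults and the 100 µs poll interval are assumptions, not a fixed policy.
impl TransferStrategy {
    /// Pick a conservative default for the detected platform capabilities.
    pub fn default_for(visibility: MemoryVisibility, stream: StreamHandle) -> Self {
        match visibility {
            // Full host/device coherence: lean on unified memory.
            MemoryVisibility::SystemScope => TransferStrategy::Unified,
            // Device-scope only (WSL2, Metal): async copies with explicit sync points.
            MemoryVisibility::DeviceScope => TransferStrategy::Async { stream },
            // No coherence guarantees: explicit DMA with polling.
            MemoryVisibility::ExplicitDma => TransferStrategy::DmaPolling { poll_interval_us: 100 },
        }
    }
}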
/// Bridge for host↔GPU ring buffer synchronization.
pub struct RingBufferBridge<T: RingMessage> {
host_buffer: PinnedBuffer<T>,
gpu_buffer: GpuBuffer<T>,
strategy: TransferStrategy,
// Transfer tracking
host_to_gpu_count: AtomicU64,
gpu_to_host_count: AtomicU64,
}
impl<T: RingMessage> RingBufferBridge<T> {
/// Transfer pending messages from host to GPU.
pub async fn flush_to_device(&mut self) -> Result<usize> {
        match &self.strategy {
            TransferStrategy::Sync => self.sync_copy_to_device(),
            TransferStrategy::Async { stream } => {
                // Clone the handle so the shared borrow of `self.strategy`
                // ends before the mutable call below (StreamHandle is
                // assumed to be cheaply cloneable).
                let stream = stream.clone();
                self.async_copy_to_device(&stream).await
            }
            TransferStrategy::Unified => {
                // No-op: unified memory is directly visible to the device.
                Ok(0)
            }
            TransferStrategy::DmaPolling { poll_interval_us } => {
                self.dma_poll_to_device(*poll_interval_us).await
            }
        }
}
}
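Host-side usage is a simple pump loop that drains outbound messages. A sketch assuming a Tokio runtime; the back-off interval is arbitrary.
/// Drive host→GPU transfers until an error occurs (illustrative loop).
async fn pump<T: RingMessage>(bridge: &mut RingBufferBridge<T>) -> Result<()> {
    loop {
        let transferred = bridge.flush_to_device().await?;
        if transferred == 0 {
            // Nothing pending: back off briefly instead of spinning.
            tokio::time::sleep(std::time::Duration::from_micros(50)).await;
        }
    }
}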
Cache-Line Alignment
Proper alignment is critical for performance: it keeps hot control structures on their own cache lines and avoids false sharing between host and device writers:
/// Ensure cache-line alignment for GPU structures.
#[repr(C, align(128))]
pub struct CacheLineAligned<T> {
value: T,
}
/// Control block must be 128-byte aligned (dual cache line).
#[repr(C, align(128))]
pub struct ControlBlock {
// ... fields (see 01-architecture-overview.md)
}
/// Telemetry buffer is 64-byte aligned (single cache line).
#[repr(C, align(64))]
pub struct TelemetryBuffer {
// ... fields
}
// Compile-time size assertions
const _: () = assert!(std::mem::size_of::<ControlBlock>() == 128);
const _: () = assert!(std::mem::size_of::<TelemetryBuffer>() == 64);
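The same compile-time pattern can assert alignment as well as size:
const _: () = assert!(std::mem::align_of::<ControlBlock>() == 128);
const _: () = assert!(std::mem::align_of::<TelemetryBuffer>() == 64);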