// ringkernel_cuda/lib.rs

1//! CUDA Backend for RingKernel
2//!
3//! This crate provides NVIDIA CUDA GPU support for RingKernel using cudarc.
4//!
5//! # Features
6//!
7//! - Persistent kernel execution (cooperative groups)
8//! - Lock-free message queues in GPU global memory
9//! - PTX compilation via NVRTC
10//! - Multi-GPU support
11//!
12//! # Requirements
13//!
14//! - NVIDIA GPU with Compute Capability 7.0+
15//! - CUDA Toolkit 11.0+
16//! - Native Linux (persistent kernels) or WSL2 (event-driven fallback)
17//!
18//! # Example
19//!
20//! ```ignore
21//! use ringkernel_cuda::CudaRuntime;
22//! use ringkernel_core::runtime::RingKernelRuntime;
23//!
24//! #[tokio::main]
25//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
26//!     let runtime = CudaRuntime::new().await?;
27//!     let kernel = runtime.launch("vector_add", Default::default()).await?;
28//!     kernel.activate().await?;
29//!     Ok(())
30//! }
31//! ```
32
33#![warn(missing_docs)]
34#![warn(clippy::unwrap_used)]
35
36#[cfg(feature = "ptx-cache")]
37pub mod compile;
38#[cfg(feature = "cooperative")]
39pub mod cooperative;
40#[cfg(feature = "cuda")]
41mod device;
42#[cfg(feature = "cuda")]
43pub mod driver_api;
44#[cfg(feature = "cuda")]
45pub mod hopper;
46#[cfg(feature = "cuda")]
47pub mod k2k_gpu;
48#[cfg(feature = "cuda")]
49mod kernel;
50#[cfg(feature = "cuda")]
51pub mod launch_config;
52#[cfg(feature = "cuda")]
53mod memory;
54#[cfg(feature = "cuda")]
55pub mod memory_pool;
56#[cfg(feature = "cuda")]
57pub mod multi_gpu;
58#[cfg(feature = "cuda")]
59pub mod persistent;
60#[cfg(feature = "cuda")]
61pub mod phases;
62#[cfg(feature = "profiling")]
63pub mod profiling;
64#[cfg(feature = "cuda")]
65pub mod reduction;
66#[cfg(feature = "cuda")]
67mod runtime;
68#[cfg(feature = "cuda")]
69mod stencil;
70#[cfg(feature = "cuda")]
71pub mod stream;
72
73#[cfg(feature = "cuda")]
74pub use device::CudaDevice;
75#[cfg(feature = "cuda")]
76pub use kernel::CudaKernel;
77#[cfg(feature = "cuda")]
78pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
79#[cfg(feature = "cuda")]
80pub use persistent::CudaMappedBuffer;
81#[cfg(feature = "cuda")]
82pub use phases::{
83    InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
84    SyncMode,
85};
86#[cfg(feature = "cuda")]
87pub use reduction::{
88    generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
89    CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
90    ReductionBufferCache,
91};
92#[cfg(feature = "cuda")]
93pub use runtime::CudaRuntime;
94#[cfg(feature = "cuda")]
95pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
96
97// Profiling re-exports
98#[cfg(feature = "profiling")]
99pub use profiling::{
100    CudaEvent, CudaEventFlags, CudaMemoryKind, CudaMemoryTracker, CudaNvtxProfiler,
101    GpuChromeTraceBuilder, GpuEventArgs, GpuTimer, GpuTimerPool, GpuTraceEvent, KernelMetrics,
102    ProfilingSession, TrackedAllocation, TransferDirection, TransferMetrics,
103};
104
105// PTX cache re-exports
106#[cfg(feature = "ptx-cache")]
107pub use compile::{PtxCache, PtxCacheError, PtxCacheResult, PtxCacheStats, CACHE_VERSION};
108
109// GPU memory pool re-exports
110#[cfg(feature = "cuda")]
111pub use memory_pool::{
112    GpuBucketStats, GpuPoolConfig, GpuPoolDiagnostics, GpuSizeClass, GpuStratifiedPool,
113};
114
115// Stream manager re-exports
116#[cfg(feature = "cuda")]
117pub use stream::{
118    OverlapMetrics, StreamConfig, StreamConfigBuilder, StreamError, StreamId, StreamManager,
119    StreamPool, StreamPoolStats, StreamResult,
120};
121
/// Re-export memory module for advanced usage.
///
/// The `memory` module itself is private at the crate root; this submodule
/// exposes its core types under one stable, grouped path so downstream code
/// can `use ringkernel_cuda::memory_exports::*` instead of importing each
/// type individually from the crate root.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    // Same four types that are also re-exported at the crate root.
    pub use super::memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
}
127
// Placeholder implementations when CUDA is not available.
//
// The `unavailable_backend!` macro (declared in ringkernel_core) expands to a
// `CudaRuntime` type whose operations report the backend as unavailable, so
// downstream crates compile unchanged when the `cuda` feature is disabled.
#[cfg(not(feature = "cuda"))]
mod stub {
    ringkernel_core::unavailable_backend!(
        CudaRuntime,
        ringkernel_core::runtime::Backend::Cuda,
        "CUDA"
    );
}

// Surface the stub under the same path as the real runtime so callers can
// `use ringkernel_cuda::CudaRuntime` regardless of feature flags.
#[cfg(not(feature = "cuda"))]
pub use stub::CudaRuntime;
140
/// Check if CUDA is available at runtime.
///
/// Returns `false` when any of the following hold:
/// - the `cuda` feature is not enabled,
/// - the CUDA libraries are not installed on the system,
/// - no CUDA devices are present.
///
/// cudarc panics when the CUDA driver libraries cannot be loaded, so the
/// probe runs inside [`std::panic::catch_unwind`] and treats a panic as
/// "not available".
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // Probe the driver: any error or panic means CUDA is unusable.
        let probe = || match cudarc::driver::CudaContext::device_count() {
            Ok(count) => count > 0,
            Err(_) => false,
        };
        std::panic::catch_unwind(probe).unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
165
/// Get CUDA device count.
///
/// Returns 0 if CUDA is not available, the libraries are not installed, or
/// the driver reports an error. cudarc panics when the CUDA libraries cannot
/// be loaded, so the query is wrapped in [`std::panic::catch_unwind`] and a
/// panic is treated as zero devices.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        let counted = std::panic::catch_unwind(|| {
            match cudarc::driver::CudaContext::device_count() {
                Ok(n) => n as usize,
                Err(_) => 0,
            }
        });
        // A panic inside the probe also counts as "no devices".
        counted.unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
183
/// Compile CUDA C source code to PTX using NVRTC.
///
/// This wraps `cudarc::nvrtc::compile_ptx` to provide PTX compilation
/// without requiring downstream crates to depend on cudarc directly.
///
/// # Arguments
///
/// * `cuda_source` - CUDA C source code string
///
/// # Returns
///
/// PTX assembly as a string.
///
/// # Errors
///
/// Returns `RingKernelError::CompilationError` if NVRTC rejects the source.
///
/// # Example
///
/// ```ignore
/// use ringkernel_cuda::compile_ptx;
///
/// let cuda_source = r#"
///     extern "C" __global__ void add(float* a, float* b, float* c, int n) {
///         int i = blockIdx.x * blockDim.x + threadIdx.x;
///         if (i < n) c[i] = a[i] + b[i];
///     }
/// "#;
///
/// let ptx = compile_ptx(cuda_source)?;
/// ```
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    match cudarc::nvrtc::compile_ptx(cuda_source) {
        // Render the compiled PTX back to its textual form for the caller.
        Ok(ptx) => Ok(ptx.to_src().to_string()),
        Err(e) => Err(RingKernelError::CompilationError(format!(
            "NVRTC compilation failed: {}",
            e
        ))),
    }
}
221
222/// Stub compile_ptx when CUDA is not available.
223#[cfg(not(feature = "cuda"))]
224pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
225    Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
226        "CUDA feature not enabled".to_string(),
227    ))
228}
229
/// PTX kernel source template for persistent ring kernel.
///
/// This is a minimal kernel that immediately marks itself as terminated:
/// it loads the control-block pointer from its first parameter, stores the
/// constant 1 to a u32 field at byte offset 8 of that block, and returns.
/// The remaining three queue/state parameters are accepted but unused here.
///
/// Uses PTX 8.0 / sm_75 as the lowest common denominator that supports
/// cooperative groups. PTX is forward-compatible, so sm_75 PTX runs on
/// sm_89/sm_90/sm_100 and newer GPUs.
///
/// NOTE(review): `.version 8.0` requires a driver that understands PTX ISA
/// 8.0 (shipped with CUDA 12.x), while the crate docs state "CUDA Toolkit
/// 11.0+" — confirm the minimum toolkit/driver requirement is consistent.
///
/// NOTE(review): offset 8 presumably corresponds to the `is_terminated`
/// field of the control block defined in the `memory` module — verify the
/// layout matches if that struct changes.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_75
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;