// ringkernel_cuda/lib.rs

1//! CUDA Backend for RingKernel
2//!
3//! This crate provides NVIDIA CUDA GPU support for RingKernel using cudarc.
4//!
5//! # Features
6//!
7//! - Persistent kernel execution (cooperative groups)
8//! - Lock-free message queues in GPU global memory
9//! - PTX compilation via NVRTC
10//! - Multi-GPU support
11//!
12//! # Requirements
13//!
14//! - NVIDIA GPU with Compute Capability 7.0+
15//! - CUDA Toolkit 11.0+
16//! - Native Linux (persistent kernels) or WSL2 (event-driven fallback)
17//!
18//! # Example
19//!
20//! ```ignore
21//! use ringkernel_cuda::CudaRuntime;
22//!
23//! #[tokio::main]
24//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
25//!     let runtime = CudaRuntime::new().await?;
26//!     let kernel = runtime.launch("vector_add", Default::default()).await?;
27//!     kernel.activate().await?;
28//!     Ok(())
29//! }
30//! ```
31
32#![warn(missing_docs)]
33
// Feature-gated module tree: `cooperative` has its own feature flag; every
// other module requires the `cuda` feature. Private modules (`device`,
// `kernel`, `memory`, `runtime`, `stencil`) are surfaced selectively via
// the re-exports below.
#[cfg(feature = "cooperative")]
pub mod cooperative;
#[cfg(feature = "cuda")]
mod device;
#[cfg(feature = "cuda")]
pub mod driver_api;
#[cfg(feature = "cuda")]
pub mod k2k_gpu;
#[cfg(feature = "cuda")]
mod kernel;
#[cfg(feature = "cuda")]
mod memory;
#[cfg(feature = "cuda")]
pub mod persistent;
#[cfg(feature = "cuda")]
mod runtime;
#[cfg(feature = "cuda")]
mod stencil;

// Public surface of the crate when CUDA support is compiled in. When the
// `cuda` feature is off, `stub::CudaRuntime` is re-exported under the same
// name instead, so downstream code compiles either way.
#[cfg(feature = "cuda")]
pub use device::CudaDevice;
#[cfg(feature = "cuda")]
pub use kernel::CudaKernel;
#[cfg(feature = "cuda")]
pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
#[cfg(feature = "cuda")]
pub use runtime::CudaRuntime;
#[cfg(feature = "cuda")]
pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
63
/// Re-export memory module for advanced usage.
///
/// NOTE(review): these are the same four names already re-exported at the
/// crate root; this namespaced copy exists for callers that prefer an
/// explicit `memory_exports::` path. Consider deprecating one of the two
/// in a future major release.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    pub use super::memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
}
69
70// Placeholder implementations when CUDA is not available
71#[cfg(not(feature = "cuda"))]
72mod stub {
73    use async_trait::async_trait;
74    use ringkernel_core::error::{Result, RingKernelError};
75    use ringkernel_core::runtime::{
76        Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
77    };
78
79    /// Stub CUDA runtime when CUDA feature is disabled.
80    pub struct CudaRuntime;
81
82    impl CudaRuntime {
83        /// Create fails when CUDA is not available.
84        pub async fn new() -> Result<Self> {
85            Err(RingKernelError::BackendUnavailable(
86                "CUDA feature not enabled".to_string(),
87            ))
88        }
89    }
90
91    #[async_trait]
92    impl RingKernelRuntime for CudaRuntime {
93        fn backend(&self) -> Backend {
94            Backend::Cuda
95        }
96
97        fn is_backend_available(&self, _backend: Backend) -> bool {
98            false
99        }
100
101        async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
102            Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
103        }
104
105        fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
106            None
107        }
108
109        fn list_kernels(&self) -> Vec<KernelId> {
110            vec![]
111        }
112
113        fn metrics(&self) -> RuntimeMetrics {
114            RuntimeMetrics::default()
115        }
116
117        async fn shutdown(&self) -> Result<()> {
118            Ok(())
119        }
120    }
121}
122
// Without the `cuda` feature, expose the always-failing stub runtime under
// the same `CudaRuntime` name so downstream code compiles either way.
#[cfg(not(feature = "cuda"))]
pub use stub::CudaRuntime;
125
/// Check if CUDA is available at runtime.
///
/// Returns `false` when any of the following holds:
/// - the `cuda` cargo feature is disabled,
/// - the CUDA driver libraries are not installed on the system,
/// - no CUDA devices are present.
///
/// cudarc panics when the driver libraries cannot be loaded, so the probe
/// runs under `catch_unwind` and a panic is treated as "not available".
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // Probe in a closure so a panic inside cudarc is contained.
        let probe = || matches!(cudarc::driver::CudaContext::device_count(), Ok(n) if n > 0);
        std::panic::catch_unwind(probe).unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
150
/// Get CUDA device count.
///
/// Yields 0 when the `cuda` feature is off, when the CUDA libraries are
/// not installed (cudarc panics in that case, which is intercepted with
/// `catch_unwind`), or when the device query itself reports an error.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        let query = || cudarc::driver::CudaContext::device_count();
        // Outer Result: did the probe panic? Inner Result: did CUDA error?
        match std::panic::catch_unwind(query) {
            Ok(Ok(count)) => count as usize,
            _ => 0,
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
168
/// PTX kernel source template for persistent ring kernel.
///
/// This is a minimal placeholder: the kernel loads the control-block
/// pointer from its first parameter, stores 1 into the 32-bit field at
/// byte offset 8 of the control block (marking itself as terminated),
/// and returns immediately. The queue and shared-state parameters are
/// declared in the entry signature but unused here.
///
/// Uses PTX ISA 8.0 / sm_89 for Ada Lovelace GPU compatibility (RTX 40xx
/// series); older drivers/toolkits may reject this ISA version.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;