1#![warn(missing_docs)]
34#![warn(clippy::unwrap_used)]
35
36#[cfg(feature = "ptx-cache")]
37pub mod compile;
38#[cfg(feature = "cooperative")]
39pub mod cooperative;
40#[cfg(feature = "cuda")]
41mod device;
42#[cfg(feature = "cuda")]
43pub mod driver_api;
44#[cfg(feature = "cuda")]
45pub mod hopper;
46#[cfg(feature = "cuda")]
47pub mod k2k_gpu;
48#[cfg(feature = "cuda")]
49mod kernel;
50#[cfg(feature = "cuda")]
51pub mod launch_config;
52#[cfg(feature = "cuda")]
53mod memory;
54#[cfg(feature = "cuda")]
55pub mod memory_pool;
56#[cfg(feature = "cuda")]
57pub mod multi_gpu;
58#[cfg(feature = "cuda")]
59pub mod persistent;
60#[cfg(feature = "cuda")]
61pub mod phases;
62#[cfg(feature = "profiling")]
63pub mod profiling;
64#[cfg(feature = "cuda")]
65pub mod reduction;
66#[cfg(feature = "cuda")]
67mod runtime;
68#[cfg(feature = "cuda")]
69mod stencil;
70#[cfg(feature = "cuda")]
71pub mod stream;
72
73#[cfg(feature = "cuda")]
74pub use device::CudaDevice;
75#[cfg(feature = "cuda")]
76pub use kernel::CudaKernel;
77#[cfg(feature = "cuda")]
78pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
79#[cfg(feature = "cuda")]
80pub use persistent::CudaMappedBuffer;
81#[cfg(feature = "cuda")]
82pub use phases::{
83 InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
84 SyncMode,
85};
86#[cfg(feature = "cuda")]
87pub use reduction::{
88 generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
89 CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
90 ReductionBufferCache,
91};
92#[cfg(feature = "cuda")]
93pub use runtime::CudaRuntime;
94#[cfg(feature = "cuda")]
95pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
96
97#[cfg(feature = "profiling")]
99pub use profiling::{
100 CudaEvent, CudaEventFlags, CudaMemoryKind, CudaMemoryTracker, CudaNvtxProfiler,
101 GpuChromeTraceBuilder, GpuEventArgs, GpuTimer, GpuTimerPool, GpuTraceEvent, KernelMetrics,
102 ProfilingSession, TrackedAllocation, TransferDirection, TransferMetrics,
103};
104
105#[cfg(feature = "ptx-cache")]
107pub use compile::{PtxCache, PtxCacheError, PtxCacheResult, PtxCacheStats, CACHE_VERSION};
108
109#[cfg(feature = "cuda")]
111pub use memory_pool::{
112 GpuBucketStats, GpuPoolConfig, GpuPoolDiagnostics, GpuSizeClass, GpuStratifiedPool,
113};
114
115#[cfg(feature = "cuda")]
117pub use stream::{
118 OverlapMetrics, StreamConfig, StreamConfigBuilder, StreamError, StreamId, StreamManager,
119 StreamPool, StreamPoolStats, StreamResult,
120};
121
/// Re-exports of the GPU memory types under a dedicated module path.
///
/// Each item is re-exported from the crate-private `memory` module and is
/// only compiled when the `cuda` feature is enabled.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    pub use super::memory::CudaBuffer;
    pub use super::memory::CudaControlBlock;
    pub use super::memory::CudaMemoryPool;
    pub use super::memory::CudaMessageQueue;
}
127
// Fallback module compiled only when the `cuda` feature is disabled.
#[cfg(not(feature = "cuda"))]
mod stub {
    // The macro receives the type name `CudaRuntime`, the `Backend::Cuda`
    // variant and the human-readable backend name "CUDA". The crate root
    // re-exports `stub::CudaRuntime`, so the expansion must define an item
    // with that name.
    // NOTE(review): presumably the generated type reports the backend as
    // unavailable from its operations; confirm against the macro definition
    // in `ringkernel_core`.
    ringkernel_core::unavailable_backend!(
        CudaRuntime,
        ringkernel_core::runtime::Backend::Cuda,
        "CUDA"
    );
}
137
138#[cfg(not(feature = "cuda"))]
139pub use stub::CudaRuntime;
140
/// Reports whether at least one CUDA device can be used.
///
/// With the `cuda` feature enabled, this asks the driver for its device
/// count and returns `true` only when that query succeeds with a value
/// greater than zero. A failed query, or a panic raised while querying,
/// yields `false`.
///
/// Without the `cuda` feature this always returns `false`.
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // The query runs under `catch_unwind` so that a panic inside the
        // driver bindings is reported as "not available" instead of
        // propagating to the caller.
        // NOTE(review): presumably this covers the case where the CUDA
        // driver library cannot be loaded; confirm against cudarc.
        let probe = std::panic::catch_unwind(|| {
            match cudarc::driver::CudaContext::device_count() {
                Ok(devices) => devices > 0,
                Err(_) => false,
            }
        });
        matches!(probe, Ok(true))
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
165
/// Returns the number of CUDA devices reported by the driver.
///
/// With the `cuda` feature enabled, the driver's device count is returned.
/// The result is `0` when the query fails, when it panics, or when the
/// reported count cannot be represented as a `usize` (for example a
/// negative value).
///
/// Without the `cuda` feature this always returns `0`.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // The query runs under `catch_unwind` so that a panic inside the
        // driver bindings is reported as "no devices" instead of
        // propagating to the caller.
        std::panic::catch_unwind(|| {
            cudarc::driver::CudaContext::device_count()
                .ok()
                // Checked conversion: a plain `as usize` cast would turn a
                // negative count into a very large device count.
                .and_then(|count| usize::try_from(count).ok())
                .unwrap_or(0)
        })
        .unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
183
/// Compiles CUDA source code to PTX text using NVRTC.
///
/// The source is handed to `cudarc::nvrtc::compile_ptx` and the resulting
/// PTX is returned as an owned `String`.
///
/// # Errors
///
/// Returns `RingKernelError::CompilationError` carrying the NVRTC error
/// message when compilation fails.
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    match cudarc::nvrtc::compile_ptx(cuda_source) {
        Ok(compiled) => Ok(compiled.to_src().to_string()),
        Err(nvrtc_err) => Err(RingKernelError::CompilationError(format!(
            "NVRTC compilation failed: {}",
            nvrtc_err
        ))),
    }
}
221
222#[cfg(not(feature = "cuda"))]
224pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
225 Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
226 "CUDA feature not enabled".to_string(),
227 ))
228}
229
/// Minimal PTX module containing a single entry point, `ring_kernel_main`.
///
/// The module declares PTX version 8.0, target `sm_75` and 64-bit addresses.
/// The entry point takes four `u64` parameters (`control_block_ptr`,
/// `input_queue_ptr`, `output_queue_ptr`, `shared_state_ptr`). It loads the
/// control block pointer, stores the 32-bit value `1` at byte offset 8 from
/// it (described in the PTX as marking the kernel terminated), and returns.
/// The other three parameters are not read.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_75
.address_size 64

.visible .entry ring_kernel_main(
 .param .u64 control_block_ptr,
 .param .u64 input_queue_ptr,
 .param .u64 output_queue_ptr,
 .param .u64 shared_state_ptr
) {
 .reg .u64 %cb_ptr;
 .reg .u32 %one;

 // Load control block pointer
 ld.param.u64 %cb_ptr, [control_block_ptr];

 // Mark as terminated immediately (offset 8)
 mov.u32 %one, 1;
 st.global.u32 [%cb_ptr + 8], %one;

 ret;
}
"#;