Struct MultiGpuCoordinator
pub struct MultiGpuCoordinator {
config: MultiGpuConfig,
devices: RwLock<RawRwLock, Vec<DeviceInfo>>,
kernel_device_map: RwLock<RawRwLock, HashMap<KernelId, usize>>,
device_kernel_counts: RwLock<RawRwLock, Vec<AtomicUsize>>,
round_robin_counter: AtomicUsize,
total_kernels: AtomicU64,
custom_selector: RwLock<RawRwLock, Option<Arc<dyn Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync>>>,
topology: RwLock<RawRwLock, Option<GpuTopology>>,
}
Expand description
Multi-GPU coordinator for managing kernels across devices.
Fields§
§config: MultiGpuConfig
§devices: RwLock<RawRwLock, Vec<DeviceInfo>>
§kernel_device_map: RwLock<RawRwLock, HashMap<KernelId, usize>>
§device_kernel_counts: RwLock<RawRwLock, Vec<AtomicUsize>>
§round_robin_counter: AtomicUsize
§total_kernels: AtomicU64
§custom_selector: RwLock<RawRwLock, Option<Arc<dyn Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync>>>
§topology: RwLock<RawRwLock, Option<GpuTopology>>
Implementations§
§impl MultiGpuCoordinator
impl MultiGpuCoordinator
pub fn new(config: MultiGpuConfig) -> Arc<MultiGpuCoordinator>
pub fn new(config: MultiGpuConfig) -> Arc<MultiGpuCoordinator>
Create a new multi-GPU coordinator.
pub fn register_device(&self, device: DeviceInfo)
pub fn register_device(&self, device: DeviceInfo)
Register a device with the coordinator.
pub fn unregister_device(&self, index: usize) -> DeviceUnregisterResult
pub fn unregister_device(&self, index: usize) -> DeviceUnregisterResult
Unregister a device and plan kernel migrations.
This method:
- Identifies all kernels on the device being removed
- Finds target devices for each kernel using load balancing
- Creates migration plans for kernels that can be moved
- Marks orphaned kernels that have no migration target
- Updates internal routing tables
The caller is responsible for executing the actual migrations using
KernelMigrator with the returned KernelMigrationPlan entries.
pub fn devices(&self) -> Vec<DeviceInfo>
pub fn devices(&self) -> Vec<DeviceInfo>
Get all registered devices.
pub fn device(&self, index: usize) -> Option<DeviceInfo>
pub fn device(&self, index: usize) -> Option<DeviceInfo>
Get device info by index.
pub fn device_count(&self) -> usize
pub fn device_count(&self) -> usize
Get number of devices.
pub fn select_device(
&self,
options: &LaunchOptions,
) -> Result<usize, RingKernelError>
pub fn select_device( &self, options: &LaunchOptions, ) -> Result<usize, RingKernelError>
Select a device for launching a kernel.
pub fn assign_kernel(&self, kernel_id: KernelId, device_index: usize)
pub fn assign_kernel(&self, kernel_id: KernelId, device_index: usize)
Assign a kernel to a device.
pub fn remove_kernel(&self, kernel_id: &KernelId)
pub fn remove_kernel(&self, kernel_id: &KernelId)
Remove a kernel assignment.
pub fn get_kernel_device(&self, kernel_id: &KernelId) -> Option<usize>
pub fn get_kernel_device(&self, kernel_id: &KernelId) -> Option<usize>
Get device for a kernel.
pub fn kernels_on_device(&self, device_index: usize) -> Vec<KernelId>
pub fn kernels_on_device(&self, device_index: usize) -> Vec<KernelId>
Get all kernels on a device.
pub fn get_all_status(&self) -> Vec<DeviceStatus>
pub fn get_all_status(&self) -> Vec<DeviceStatus>
Get status of all devices.
pub fn get_device_status(&self, device_index: usize) -> Option<DeviceStatus>
pub fn get_device_status(&self, device_index: usize) -> Option<DeviceStatus>
Get status of a specific device.
pub fn set_custom_selector<F>(&self, selector: F)
pub fn set_custom_selector<F>(&self, selector: F)
Set custom device selector.
pub fn stats(&self) -> MultiGpuStats
pub fn stats(&self) -> MultiGpuStats
Get coordinator statistics.
pub fn can_p2p(&self, device_a: usize, device_b: usize) -> bool
pub fn can_p2p(&self, device_a: usize, device_b: usize) -> bool
Check if P2P is available between two devices.
pub fn update_device_memory(&self, device_index: usize, available_memory: u64)
pub fn update_device_memory(&self, device_index: usize, available_memory: u64)
Update device memory info.
pub fn discover_topology(&self) -> GpuTopology
pub fn discover_topology(&self) -> GpuTopology
Discover GPU topology (estimates if probing not available).
pub fn topology(&self) -> GpuTopology
pub fn topology(&self) -> GpuTopology
Get current topology (discovers if not cached).
pub fn set_topology(&self, topology: GpuTopology)
pub fn set_topology(&self, topology: GpuTopology)
Set custom topology (for testing or manual configuration).
pub fn select_device_for_k2k(
&self,
source_kernel: &KernelId,
) -> Result<usize, RingKernelError>
pub fn select_device_for_k2k( &self, source_kernel: &KernelId, ) -> Result<usize, RingKernelError>
Get best device for communicating with a source kernel.
pub fn request_migration(
&self,
kernel_id: &KernelId,
target_device: usize,
) -> Result<MigrationRequest, RingKernelError>
pub fn request_migration( &self, kernel_id: &KernelId, target_device: usize, ) -> Result<MigrationRequest, RingKernelError>
Request to migrate a kernel to another device.
pub fn complete_migration(
&self,
request: &MigrationRequest,
) -> Result<(), RingKernelError>
pub fn complete_migration( &self, request: &MigrationRequest, ) -> Result<(), RingKernelError>
Complete a migration (updates internal mappings).
Auto Trait Implementations§
impl !Freeze for MultiGpuCoordinator
impl !RefUnwindSafe for MultiGpuCoordinator
impl Send for MultiGpuCoordinator
impl Sync for MultiGpuCoordinator
impl Unpin for MultiGpuCoordinator
impl !UnwindSafe for MultiGpuCoordinator
Blanket Implementations§
§impl<T> ArchivePointee for T
impl<T> ArchivePointee for T
§type ArchivedMetadata = ()
type ArchivedMetadata = ()
§fn pointer_metadata(
_: &<T as ArchivePointee>::ArchivedMetadata,
) -> <T as Pointee>::Metadata
fn pointer_metadata( _: &<T as ArchivePointee>::ArchivedMetadata, ) -> <T as Pointee>::Metadata
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
§impl<F, W, T, D> Deserialize<With<T, W>, D> for F
impl<F, W, T, D> Deserialize<With<T, W>, D> for F
§fn deserialize(
&self,
deserializer: &mut D,
) -> Result<With<T, W>, <D as Fallible>::Error>
fn deserialize( &self, deserializer: &mut D, ) -> Result<With<T, W>, <D as Fallible>::Error>
§impl<T> Instrument for T
impl<T> Instrument for T
§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more