pub struct MultiGpuCoordinator {
config: MultiGpuConfig,
devices: RwLock<Vec<DeviceInfo>>,
kernel_device_map: RwLock<HashMap<KernelId, usize>>,
device_kernel_counts: RwLock<Vec<AtomicUsize>>,
round_robin_counter: AtomicUsize,
total_kernels: AtomicU64,
custom_selector: RwLock<Option<Arc<dyn Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync>>>,
topology: RwLock<Option<GpuTopology>>,
}
Multi-GPU coordinator for managing kernels across devices.
Fields
config: MultiGpuConfig — Configuration.
devices: RwLock<Vec<DeviceInfo>> — Available devices.
kernel_device_map: RwLock<HashMap<KernelId, usize>> — Kernel-to-device mapping.
device_kernel_counts: RwLock<Vec<AtomicUsize>> — Device kernel counts.
round_robin_counter: AtomicUsize — Round-robin counter.
total_kernels: AtomicU64 — Total kernels launched.
custom_selector: RwLock<Option<Arc<dyn Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync>>> — Device selection callbacks (for custom strategy).
topology: RwLock<Option<GpuTopology>> — GPU topology graph.
Implementations
impl MultiGpuCoordinator
pub fn new(config: MultiGpuConfig) -> Arc<Self>
Create a new multi-GPU coordinator.
pub fn register_device(&self, device: DeviceInfo)
Register a device with the coordinator.
pub fn unregister_device(&self, index: usize) -> DeviceUnregisterResult
Unregister a device and plan kernel migrations.
This method:
- Identifies all kernels on the device being removed
- Finds target devices for each kernel using load balancing
- Creates migration plans for kernels that can be moved
- Marks orphaned kernels that have no migration target
- Updates internal routing tables
The caller is responsible for executing the actual migrations using
`KernelMigrator` with the returned `KernelMigrationPlan` entries.
fn select_migration_target(&self, candidates: &[usize]) -> Option<usize>
Select the best target device for migration.
fn calculate_migration_priority(
    &self,
    _kernel_id: &KernelId,
) -> MigrationPriority
Calculate migration priority for a kernel.
pub fn devices(&self) -> Vec<DeviceInfo>
Get all registered devices.
pub fn device(&self, index: usize) -> Option<DeviceInfo>
Get device info by index.
pub fn device_count(&self) -> usize
Get number of devices.
pub fn select_device(&self, options: &LaunchOptions) -> Result<usize>
Select a device for launching a kernel.
pub fn assign_kernel(&self, kernel_id: KernelId, device_index: usize)
Assign a kernel to a device.
pub fn remove_kernel(&self, kernel_id: &KernelId)
Remove a kernel assignment.
pub fn get_kernel_device(&self, kernel_id: &KernelId) -> Option<usize>
Get device for a kernel.
pub fn kernels_on_device(&self, device_index: usize) -> Vec<KernelId>
Get all kernels on a device.
pub fn get_all_status(&self) -> Vec<DeviceStatus>
Get status of all devices.
pub fn get_device_status(&self, device_index: usize) -> Option<DeviceStatus>
Get status of a specific device.
pub fn set_custom_selector<F>(&self, selector: F)
Set custom device selector.
pub fn stats(&self) -> MultiGpuStats
Get coordinator statistics.
pub fn can_p2p(&self, device_a: usize, device_b: usize) -> bool
Check if P2P is available between two devices.
pub fn update_device_memory(&self, device_index: usize, available_memory: u64)
Update device memory info.
pub fn discover_topology(&self) -> GpuTopology
Discover GPU topology (estimates if probing not available).
pub fn topology(&self) -> GpuTopology
Get current topology (discovers if not cached).
pub fn set_topology(&self, topology: GpuTopology)
Set custom topology (for testing or manual configuration).
pub fn select_device_for_k2k(&self, source_kernel: &KernelId) -> Result<usize>
Get best device for communicating with a source kernel.
pub fn request_migration(
    &self,
    kernel_id: &KernelId,
    target_device: usize,
) -> Result<MigrationRequest>
Request to migrate a kernel to another device.
pub fn complete_migration(&self, request: &MigrationRequest) -> Result<()>
Complete a migration (updates internal mappings).