Skip to main content

MultiGpuCoordinator

Struct MultiGpuCoordinator 

Source
pub struct MultiGpuCoordinator {
    config: MultiGpuConfig,
    devices: RwLock<Vec<DeviceInfo>>,
    kernel_device_map: RwLock<HashMap<KernelId, usize>>,
    device_kernel_counts: RwLock<Vec<AtomicUsize>>,
    round_robin_counter: AtomicUsize,
    total_kernels: AtomicU64,
    custom_selector: RwLock<Option<Arc<dyn Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync>>>,
    topology: RwLock<Option<GpuTopology>>,
}
Expand description

Multi-GPU coordinator for managing kernels across devices.

Fields§

§config: MultiGpuConfig

Configuration.

§devices: RwLock<Vec<DeviceInfo>>

Available devices.

§kernel_device_map: RwLock<HashMap<KernelId, usize>>

Kernel-to-device mapping.

§device_kernel_counts: RwLock<Vec<AtomicUsize>>

Device kernel counts.

§round_robin_counter: AtomicUsize

Round-robin counter.

§total_kernels: AtomicU64

Total kernels launched.

§custom_selector: RwLock<Option<Arc<dyn Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync>>>

Device selection callback (used by the custom strategy), if one has been set.

§topology: RwLock<Option<GpuTopology>>

GPU topology graph.

Implementations§

Source§

impl MultiGpuCoordinator

Source

pub fn new(config: MultiGpuConfig) -> Arc<Self>

Create a new multi-GPU coordinator.

Source

pub fn register_device(&self, device: DeviceInfo)

Register a device with the coordinator.

Source

pub fn unregister_device(&self, index: usize) -> DeviceUnregisterResult

Unregister a device and plan kernel migrations.

This method:

  1. Identifies all kernels on the device being removed
  2. Finds target devices for each kernel using load balancing
  3. Creates migration plans for kernels that can be moved
  4. Marks orphaned kernels that have no migration target
  5. Updates internal routing tables

The caller is responsible for executing the actual migrations using KernelMigrator with the returned KernelMigrationPlan entries.

Source

fn select_migration_target(&self, candidates: &[usize]) -> Option<usize>

Select the best target device for migration.

Source

fn calculate_migration_priority( &self, _kernel_id: &KernelId, ) -> MigrationPriority

Calculate migration priority for a kernel.

Source

pub fn devices(&self) -> Vec<DeviceInfo>

Get all registered devices.

Source

pub fn device(&self, index: usize) -> Option<DeviceInfo>

Get device info by index.

Source

pub fn device_count(&self) -> usize

Get number of devices.

Source

pub fn select_device(&self, options: &LaunchOptions) -> Result<usize>

Select a device for launching a kernel.

Source

pub fn assign_kernel(&self, kernel_id: KernelId, device_index: usize)

Assign a kernel to a device.

Source

pub fn remove_kernel(&self, kernel_id: &KernelId)

Remove a kernel assignment.

Source

pub fn get_kernel_device(&self, kernel_id: &KernelId) -> Option<usize>

Get device for a kernel.

Source

pub fn kernels_on_device(&self, device_index: usize) -> Vec<KernelId>

Get all kernels on a device.

Source

pub fn get_all_status(&self) -> Vec<DeviceStatus>

Get status of all devices.

Source

pub fn get_device_status(&self, device_index: usize) -> Option<DeviceStatus>

Get status of a specific device.

Source

pub fn set_custom_selector<F>(&self, selector: F)
where F: Fn(&[DeviceStatus], &LaunchOptions) -> usize + Send + Sync + 'static,

Set custom device selector.

Source

pub fn stats(&self) -> MultiGpuStats

Get coordinator statistics.

Source

pub fn can_p2p(&self, device_a: usize, device_b: usize) -> bool

Check if P2P is available between two devices.

Source

pub fn update_device_memory(&self, device_index: usize, available_memory: u64)

Update device memory info.

Source

pub fn discover_topology(&self) -> GpuTopology

Discover the GPU topology (estimated when probing is not available).

Source

pub fn topology(&self) -> GpuTopology

Get the current topology (discovering it first if it is not cached).

Source

pub fn set_topology(&self, topology: GpuTopology)

Set custom topology (for testing or manual configuration).

Source

pub fn select_device_for_k2k(&self, source_kernel: &KernelId) -> Result<usize>

Get best device for communicating with a source kernel.

Source

pub fn request_migration( &self, kernel_id: &KernelId, target_device: usize, ) -> Result<MigrationRequest>

Request to migrate a kernel to another device.

Source

pub fn complete_migration(&self, request: &MigrationRequest) -> Result<()>

Complete a migration (updates internal mappings).

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
§

impl<T> ArchivePointee for T

§

type ArchivedMetadata = ()

The archived version of the pointer metadata for this type.
§

fn pointer_metadata( _: &<T as ArchivePointee>::ArchivedMetadata, ) -> <T as Pointee>::Metadata

Converts some archived metadata to the pointer metadata for itself.
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
§

impl<F, W, T, D> Deserialize<With<T, W>, D> for F
where W: DeserializeWith<F, T, D>, D: Fallible + ?Sized, F: ?Sized,

§

fn deserialize( &self, deserializer: &mut D, ) -> Result<With<T, W>, <D as Fallible>::Error>

Deserializes using the given deserializer
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

§

impl<T> Instrument for T

§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided `Span`, returning an `Instrumented` wrapper. Read more
§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

§

impl<T> LayoutRaw for T

§

fn layout_raw(_: <T as Pointee>::Metadata) -> Result<Layout, LayoutError>

Gets the layout of the type.
§

impl<T> Pointee for T

§

type Metadata = ()

The type for metadata in pointers and references to Self.
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

§

fn vzip(self) -> V

§

impl<T> WithSubscriber for T

§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided `Subscriber` to this type, returning a `WithDispatch` wrapper. Read more
§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default `Subscriber` to this type, returning a `WithDispatch` wrapper. Read more