Table of Contents

Orleans Integration Guide

Version: 0.4.1-rc2 Status: Production Ready Last Updated: November 2025

Overview

DotCompute integrates seamlessly with Microsoft Orleans to enable distributed GPU computing with actor-based concurrency. This guide covers GPU accelerator usage in Orleans grains, including device lifecycle management, error recovery, and reset strategies optimized for grain activation/deactivation patterns.

Table of Contents

Why Orleans + DotCompute?

Combining Orleans and DotCompute provides:

  • Actor-Based GPU Computing: Encapsulate GPU operations in grains for clean concurrency
  • Automatic Distribution: Orleans distributes GPU workloads across cluster nodes
  • Fault Tolerance: Grain lifecycle management with automatic error recovery
  • Resource Isolation: Each grain can manage its own GPU device or share a pool
  • Elastic Scaling: Scale GPU compute capacity dynamically
  • State Management: Persistent grain state with GPU computation results

Quick Start

1. Add Dependencies

<ItemGroup>
  <PackageReference Include="DotCompute.Core" Version="0.4.0" />
  <PackageReference Include="DotCompute.Backends.CUDA" Version="0.4.0" />
  <PackageReference Include="Microsoft.Orleans.Core" Version="8.0.0" />
  <PackageReference Include="Microsoft.Orleans.Runtime" Version="8.0.0" />
</ItemGroup>

2. Define Grain Interface

using DotCompute.Abstractions;

public interface IComputeGrain : IGrainWithStringKey
{
    Task<double[]> ProcessDataAsync(double[] input);
    Task ResetDeviceAsync();
    Task<DeviceInfo> GetDeviceInfoAsync();
}

3. Implement Grain

using DotCompute.Abstractions;
using DotCompute.Abstractions.Recovery;
using Orleans;

public class ComputeGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;
    private readonly ILogger<ComputeGrain> _logger;

    public ComputeGrain(ILogger<ComputeGrain> logger)
    {
        _logger = logger;
    }

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        // Initialize GPU accelerator on grain activation
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);

        _logger.LogInformation(
            "Grain {GrainId} activated with device {DeviceName}",
            this.GetGrainId(),
            _accelerator.DeviceName);

        await base.OnActivateAsync(cancellationToken);
    }

    public async Task<double[]> ProcessDataAsync(double[] input)
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Accelerator not initialized");

        try
        {
            // GPU computation here
            var buffer = await _accelerator.AllocateAsync<double>(input.Length);
            await buffer.CopyFromAsync(input);

            // Execute kernel...
            var kernel = await _accelerator.CompileKernelAsync(kernelCode);
            await _accelerator.ExecuteKernelAsync(kernel, buffer);

            var result = new double[input.Length];
            await buffer.CopyToAsync(result);

            await buffer.DisposeAsync();
            return result;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "GPU computation failed");

            // Attempt recovery
            await HandleErrorAsync(ex);
            throw;
        }
    }

    public async Task ResetDeviceAsync()
    {
        if (_accelerator == null) return;

        var result = await _accelerator.ResetAsync(
            ResetOptions.Hard,
            CancellationToken.None);

        if (!result.Success)
        {
            _logger.LogError("Device reset failed: {Error}", result.ErrorMessage);
            throw new InvalidOperationException($"Reset failed: {result.ErrorMessage}");
        }

        _logger.LogInformation(
            "Device reset completed: Freed {MemoryMB} MB in {Duration}ms",
            result.MemoryFreedBytes / 1024 / 1024,
            result.Duration.TotalMilliseconds);
    }

    public async Task<DeviceInfo> GetDeviceInfoAsync()
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Accelerator not initialized");

        return new DeviceInfo
        {
            DeviceId = _accelerator.DeviceId,
            DeviceName = _accelerator.DeviceName,
            BackendType = _accelerator.BackendType.ToString(),
            TotalMemory = _accelerator.TotalMemory,
            AvailableMemory = _accelerator.AvailableMemory
        };
    }

    public override async Task OnDeactivateAsync(
        DeactivationReason reason,
        CancellationToken cancellationToken)
    {
        if (_accelerator != null)
        {
            // Orleans-optimized reset before deactivation
            await _accelerator.ResetAsync(
                ResetOptions.GrainDeactivation,
                cancellationToken);

            await _accelerator.DisposeAsync();
            _accelerator = null;
        }

        _logger.LogInformation(
            "Grain {GrainId} deactivated: {Reason}",
            this.GetGrainId(),
            reason);

        await base.OnDeactivateAsync(reason, cancellationToken);
    }

    private async Task HandleErrorAsync(Exception ex)
    {
        if (_accelerator == null) return;

        // Progressive reset based on error type
        var resetType = ex switch
        {
            OutOfMemoryException => ResetType.Hard,
            InvalidOperationException => ResetType.Context,
            _ => ResetType.Soft
        };

        var result = await _accelerator.ResetAsync(
            new ResetOptions { ResetType = resetType },
            CancellationToken.None);

        if (!result.Success)
        {
            _logger.LogError(
                "Error recovery failed: {Error}",
                result.ErrorMessage);
        }
    }
}

4. Configure Orleans Silo

using Microsoft.Extensions.Hosting;
using Orleans.Hosting;

var builder = Host.CreateDefaultBuilder(args)
    .UseOrleans(siloBuilder =>
    {
        siloBuilder
            .UseLocalhostClustering()
            .ConfigureApplicationParts(parts =>
            {
                parts.AddApplicationPart(typeof(ComputeGrain).Assembly)
                     .WithReferences();
            });
    });

var host = builder.Build();
await host.RunAsync();

Grain Lifecycle Integration

OnActivateAsync Pattern

public override async Task OnActivateAsync(CancellationToken cancellationToken)
{
    try
    {
        // 1. Initialize accelerator
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);

        // 2. Warm up (optional but recommended)
        await WarmUpAcceleratorAsync(cancellationToken);

        // 3. Load any persistent state
        await ReadStateAsync();

        _logger.LogInformation(
            "Grain activated with GPU {Device}",
            _accelerator.DeviceName);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Grain activation failed");
        throw;
    }

    await base.OnActivateAsync(cancellationToken);
}

private async Task WarmUpAcceleratorAsync(CancellationToken cancellationToken)
{
    // Execute a small dummy kernel to initialize GPU context
    var dummyBuffer = await _accelerator!.AllocateAsync<float>(1024);
    await dummyBuffer.DisposeAsync();

    // Soft reset to clean up
    await _accelerator.ResetAsync(ResetOptions.Soft, cancellationToken);
}

OnDeactivateAsync Pattern

public override async Task OnDeactivateAsync(
    DeactivationReason reason,
    CancellationToken cancellationToken)
{
    if (_accelerator != null)
    {
        try
        {
            // 1. Save any state
            await WriteStateAsync();

            // 2. Cancel pending operations
            _computeCts?.Cancel();

            // 3. Orleans-optimized reset
            var resetOptions = reason switch
            {
                DeactivationReason.ApplicationShutdown => ResetOptions.CompleteCleanup,
                DeactivationReason.ActivationIdle => ResetOptions.GrainDeactivation,
                _ => ResetOptions.Hard
            };

            var result = await _accelerator.ResetAsync(
                resetOptions,
                cancellationToken);

            if (!result.Success)
            {
                _logger.LogWarning(
                    "Reset during deactivation failed: {Error}",
                    result.ErrorMessage);
            }

            // 4. Dispose accelerator
            await _accelerator.DisposeAsync();
            _accelerator = null;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during grain deactivation");
        }
    }

    await base.OnDeactivateAsync(reason, cancellationToken);
}

Device Management Strategies

Strategy 1: One Device Per Grain (Isolated)

Best for: Independent compute tasks, fault isolation

public class IsolatedComputeGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        // Each grain gets its own GPU device
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);

        await base.OnActivateAsync(cancellationToken);
    }

    // Rest of implementation...
}

Pros:

  • Complete isolation between grains
  • No contention for GPU resources
  • Simple error handling

Cons:

  • Higher memory usage
  • Limited by number of GPUs
  • Slower activation

Strategy 2: Shared Device Pool (Efficient)

Best for: Many grains, limited GPUs, efficient resource usage

public interface IAcceleratorPool
{
    Task<IAccelerator> AcquireAsync(CancellationToken cancellationToken);
    Task ReleaseAsync(IAccelerator accelerator);
}

public class AcceleratorPool : IAcceleratorPool
{
    private readonly ConcurrentBag<IAccelerator> _availableAccelerators;
    private readonly SemaphoreSlim _semaphore;
    private readonly int _maxSize;

    public AcceleratorPool(int maxSize = 4)
    {
        _maxSize = maxSize;
        _availableAccelerators = new ConcurrentBag<IAccelerator>();
        _semaphore = new SemaphoreSlim(maxSize, maxSize);
    }

    public async Task<IAccelerator> AcquireAsync(CancellationToken cancellationToken)
    {
        await _semaphore.WaitAsync(cancellationToken);

        if (_availableAccelerators.TryTake(out var accelerator))
        {
            // Soft reset before reuse
            await accelerator.ResetAsync(ResetOptions.Soft, cancellationToken);
            return accelerator;
        }

        // Create new accelerator
        return await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);
    }

    public async Task ReleaseAsync(IAccelerator accelerator)
    {
        try
        {
            // Context reset before returning to pool
            await accelerator.ResetAsync(
                ResetOptions.Context,
                CancellationToken.None);

            _availableAccelerators.Add(accelerator);
        }
        finally
        {
            _semaphore.Release();
        }
    }
}

public class PooledComputeGrain : Grain, IComputeGrain
{
    private readonly IAcceleratorPool _pool;
    private IAccelerator? _accelerator;

    public PooledComputeGrain(IAcceleratorPool pool)
    {
        _pool = pool;
    }

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        _accelerator = await _pool.AcquireAsync(cancellationToken);
        await base.OnActivateAsync(cancellationToken);
    }

    public override async Task OnDeactivateAsync(
        DeactivationReason reason,
        CancellationToken cancellationToken)
    {
        if (_accelerator != null)
        {
            await _pool.ReleaseAsync(_accelerator);
            _accelerator = null;
        }

        await base.OnDeactivateAsync(reason, cancellationToken);
    }
}

Pros:

  • Efficient GPU utilization
  • Supports many grains
  • Fast activation (reuse existing devices)

Cons:

  • Potential contention
  • More complex error handling
  • Need to manage pool lifecycle

Strategy 3: Hybrid (Smart Allocation)

Best for: Mixed workloads, varying grain priorities

public class SmartComputeGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;
    private readonly IAcceleratorPool? _pool;
    private readonly GrainPriority _priority;
    private bool _isPooled;

    public SmartComputeGrain(
        IAcceleratorPool? pool,
        GrainPriority priority = GrainPriority.Normal)
    {
        _pool = pool;
        _priority = priority;
    }

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        // High-priority grains get dedicated devices
        if (_priority == GrainPriority.High)
        {
            _accelerator = await AcceleratorFactory.CreateAsync(
                AcceleratorType.CUDA,
                cancellationToken);
            _isPooled = false;
        }
        // Normal priority use pool
        else if (_pool != null)
        {
            _accelerator = await _pool.AcquireAsync(cancellationToken);
            _isPooled = true;
        }
        else
        {
            throw new InvalidOperationException("No pool available for normal priority grain");
        }

        await base.OnActivateAsync(cancellationToken);
    }

    public override async Task OnDeactivateAsync(
        DeactivationReason reason,
        CancellationToken cancellationToken)
    {
        if (_accelerator != null)
        {
            if (_isPooled && _pool != null)
            {
                await _pool.ReleaseAsync(_accelerator);
            }
            else
            {
                await _accelerator.ResetAsync(
                    ResetOptions.CompleteCleanup,
                    cancellationToken);
                await _accelerator.DisposeAsync();
            }

            _accelerator = null;
        }

        await base.OnDeactivateAsync(reason, cancellationToken);
    }
}

public enum GrainPriority
{
    Normal,
    High
}

Reset Integration

Reset Options for Orleans

DotCompute provides Orleans-specific reset configurations:

// Optimized for grain deactivation (default for grain lifecycle)
ResetOptions.GrainDeactivation
{
    ResetType = ResetType.Context,
    WaitForCompletion = true,
    ClearKernelCache = true,
    ClearMemoryPool = false,  // Pool will be reused
    Timeout = TimeSpan.FromSeconds(5)
}

// For error recovery in grains
ResetOptions.ErrorRecovery
{
    ResetType = ResetType.Hard,
    WaitForCompletion = true,
    ClearKernelCache = true,
    ClearMemoryPool = true,
    Timeout = TimeSpan.FromSeconds(10),
    Force = true  // Don't wait for stuck operations
}

// For application shutdown
ResetOptions.CompleteCleanup
{
    ResetType = ResetType.Full,
    WaitForCompletion = true,
    ClearKernelCache = true,
    ClearMemoryPool = true,
    Reinitialize = false,  // Will be disposed anyway
    Timeout = TimeSpan.FromSeconds(30)
}

Periodic Reset Pattern

public class LongRunningComputeGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;
    private int _operationCount;
    private const int ResetThreshold = 1000;

    public async Task<ComputeResult> ProcessAsync(ComputeInput input)
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Not activated");

        try
        {
            var result = await ExecuteComputeAsync(input);

            _operationCount++;

            // Periodic soft reset to prevent resource buildup
            if (_operationCount % ResetThreshold == 0)
            {
                await _accelerator.ResetAsync(
                    ResetOptions.Soft,
                    CancellationToken.None);

                _logger.LogInformation(
                    "Performed periodic reset after {Count} operations",
                    _operationCount);
            }

            return result;
        }
        catch (Exception ex)
        {
            await HandleComputeErrorAsync(ex);
            throw;
        }
    }
}

Memory Pressure Reset

public class MemoryAwareGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;
    private readonly ILogger<MemoryAwareGrain> _logger;

    public async Task<ComputeResult> ProcessLargeDatasetAsync(LargeDataset dataset)
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Not activated");

        // Check available memory before operation
        var availableMemory = _accelerator.AvailableMemory;
        var requiredMemory = dataset.EstimatedMemoryUsage;

        if (availableMemory < requiredMemory * 1.2) // 20% safety margin
        {
            _logger.LogWarning(
                "Low memory: {Available} MB available, {Required} MB required",
                availableMemory / 1024 / 1024,
                requiredMemory / 1024 / 1024);

            // Attempt to free memory
            var result = await _accelerator.ResetAsync(
                ResetOptions.Hard,
                CancellationToken.None);

            if (result.Success && result.MemoryFreedBytes > 0)
            {
                _logger.LogInformation(
                    "Freed {Freed} MB",
                    result.MemoryFreedBytes / 1024 / 1024);
            }

            // Recheck
            availableMemory = _accelerator.AvailableMemory;
            if (availableMemory < requiredMemory)
            {
                throw new OutOfMemoryException(
                    $"Insufficient GPU memory: {availableMemory / 1024 / 1024} MB available, " +
                    $"{requiredMemory / 1024 / 1024} MB required");
            }
        }

        return await ExecuteComputeAsync(dataset);
    }
}

Error Recovery Patterns

Retry with Progressive Reset

public class ResilientComputeGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;

    public async Task<ComputeResult> ProcessWithRetryAsync(ComputeInput input)
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Not activated");

        var maxAttempts = 3;
        Exception? lastException = null;

        for (int attempt = 1; attempt <= maxAttempts; attempt++)
        {
            try
            {
                return await ExecuteComputeAsync(input);
            }
            catch (Exception ex) when (attempt < maxAttempts)
            {
                lastException = ex;
                _logger.LogWarning(
                    ex,
                    "Compute attempt {Attempt} failed, will retry",
                    attempt);

                // Progressive reset escalation
                var resetType = attempt switch
                {
                    1 => ResetType.Context,
                    2 => ResetType.Hard,
                    _ => ResetType.Full
                };

                var resetResult = await _accelerator.ResetAsync(
                    new ResetOptions { ResetType = resetType },
                    CancellationToken.None);

                if (!resetResult.Success)
                {
                    _logger.LogError(
                        "Reset failed: {Error}",
                        resetResult.ErrorMessage);
                    break;
                }

                // Exponential backoff
                await Task.Delay(TimeSpan.FromMilliseconds(100 * Math.Pow(2, attempt)));
            }
        }

        throw new InvalidOperationException(
            $"Compute failed after {maxAttempts} attempts",
            lastException);
    }
}

Circuit Breaker Pattern

public class CircuitBreakerGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;
    private int _consecutiveFailures;
    private DateTime _lastFailureTime;
    private bool _circuitOpen;

    private const int FailureThreshold = 3;
    private static readonly TimeSpan CircuitResetTime = TimeSpan.FromMinutes(5);

    public async Task<ComputeResult> ProcessAsync(ComputeInput input)
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Not activated");

        // Check circuit breaker
        if (_circuitOpen)
        {
            if (DateTime.UtcNow - _lastFailureTime > CircuitResetTime)
            {
                // Attempt to reset circuit
                await TryResetCircuitAsync();
            }
            else
            {
                throw new InvalidOperationException(
                    $"Circuit breaker open. Retry after {CircuitResetTime}");
            }
        }

        try
        {
            var result = await ExecuteComputeAsync(input);

            // Success - reset failure count
            _consecutiveFailures = 0;
            return result;
        }
        catch (Exception ex)
        {
            _consecutiveFailures++;
            _lastFailureTime = DateTime.UtcNow;

            _logger.LogError(
                ex,
                "Compute failed ({Failures}/{Threshold} failures)",
                _consecutiveFailures,
                FailureThreshold);

            // Open circuit if threshold exceeded
            if (_consecutiveFailures >= FailureThreshold)
            {
                _circuitOpen = true;
                _logger.LogCritical(
                    "Circuit breaker opened after {Failures} failures",
                    _consecutiveFailures);

                // Aggressive reset attempt
                await _accelerator.ResetAsync(
                    ResetOptions.ErrorRecovery,
                    CancellationToken.None);
            }

            throw;
        }
    }

    private async Task TryResetCircuitAsync()
    {
        _logger.LogInformation("Attempting to close circuit breaker");

        try
        {
            // Full device reset
            var result = await _accelerator!.ResetAsync(
                ResetOptions.Full,
                CancellationToken.None);

            if (result.Success)
            {
                _circuitOpen = false;
                _consecutiveFailures = 0;
                _logger.LogInformation("Circuit breaker closed successfully");
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to close circuit breaker");
        }
    }
}

Performance Optimization

Batch Processing with Smart Reset

public class BatchProcessingGrain : Grain, IBatchComputeGrain
{
    private IAccelerator? _accelerator;

    public async Task<BatchResult[]> ProcessBatchAsync(
        ComputeInput[] inputs,
        CancellationToken cancellationToken)
    {
        if (_accelerator == null)
            throw new InvalidOperationException("Not activated");

        var results = new BatchResult[inputs.Length];
        var batchSize = 100;

        for (int i = 0; i < inputs.Length; i += batchSize)
        {
            var batch = inputs.Skip(i).Take(batchSize).ToArray();

            // Process batch
            for (int j = 0; j < batch.Length; j++)
            {
                results[i + j] = await ExecuteComputeAsync(batch[j]);
            }

            // Soft reset between batches
            if (i + batchSize < inputs.Length)
            {
                await _accelerator.ResetAsync(
                    ResetOptions.Soft,
                    cancellationToken);
            }
        }

        // Context reset at the end
        await _accelerator.ResetAsync(ResetOptions.Context, cancellationToken);

        return results;
    }
}

Warm-Up and Precompilation

public class OptimizedGrain : Grain, IComputeGrain
{
    private IAccelerator? _accelerator;
    private readonly Dictionary<string, IKernel> _precompiledKernels = new();

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);

        // Precompile common kernels
        await PrecompileKernelsAsync(cancellationToken);

        // Warm-up GPU
        await WarmUpAsync(cancellationToken);

        await base.OnActivateAsync(cancellationToken);
    }

    private async Task PrecompileKernelsAsync(CancellationToken cancellationToken)
    {
        var kernelSources = GetCommonKernelSources();

        foreach (var (name, source) in kernelSources)
        {
            var kernel = await _accelerator!.CompileKernelAsync(source);
            _precompiledKernels[name] = kernel;
        }

        _logger.LogInformation(
            "Precompiled {Count} kernels during activation",
            _precompiledKernels.Count);
    }

    private async Task WarmUpAsync(CancellationToken cancellationToken)
    {
        // Execute small dummy workload to initialize GPU context
        var dummyBuffer = await _accelerator!.AllocateAsync<float>(1024);

        // Run a simple kernel
        if (_precompiledKernels.TryGetValue("warmup", out var warmupKernel))
        {
            await _accelerator.ExecuteKernelAsync(warmupKernel, dummyBuffer);
        }

        await dummyBuffer.DisposeAsync();

        // Clean up after warm-up
        await _accelerator.ResetAsync(ResetOptions.Soft, cancellationToken);

        _logger.LogInformation("GPU warm-up completed");
    }
}

Advanced Patterns

Multi-GPU Grain

public class MultiGpuGrain : Grain, IMultiGpuComputeGrain
{
    private readonly List<IAccelerator> _accelerators = new();

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        // Initialize multiple GPUs
        var deviceCount = await AcceleratorFactory.GetDeviceCountAsync(
            AcceleratorType.CUDA);

        for (int i = 0; i < deviceCount; i++)
        {
            var accelerator = await AcceleratorFactory.CreateAsync(
                AcceleratorType.CUDA,
                deviceId: i,
                cancellationToken);

            _accelerators.Add(accelerator);
        }

        _logger.LogInformation(
            "Activated with {Count} GPUs",
            _accelerators.Count);

        await base.OnActivateAsync(cancellationToken);
    }

    public async Task<ComputeResult> ProcessParallelAsync(
        ComputeInput[] inputs,
        CancellationToken cancellationToken)
    {
        // Distribute work across GPUs
        var tasks = inputs
            .Select((input, index) => ProcessOnGpuAsync(
                input,
                _accelerators[index % _accelerators.Count],
                cancellationToken))
            .ToArray();

        var results = await Task.WhenAll(tasks);

        return CombineResults(results);
    }

    public override async Task OnDeactivateAsync(
        DeactivationReason reason,
        CancellationToken cancellationToken)
    {
        // Reset all GPUs in parallel
        var resetTasks = _accelerators.Select(async accelerator =>
        {
            await accelerator.ResetAsync(
                ResetOptions.GrainDeactivation,
                cancellationToken);
            await accelerator.DisposeAsync();
        });

        await Task.WhenAll(resetTasks);
        _accelerators.Clear();

        await base.OnDeactivateAsync(reason, cancellationToken);
    }
}

Stateful Grain with Persistent GPU State

public interface IStatefulComputeGrainState
{
    byte[] ModelWeights { get; set; }
    Dictionary<string, object> Metadata { get; set; }
}

public class StatefulComputeGrain : Grain, IStatefulComputeGrain
{
    private readonly IPersistentState<IStatefulComputeGrainState> _state;
    private IAccelerator? _accelerator;
    private IUnifiedBuffer<float>? _gpuState;

    public StatefulComputeGrain(
        [PersistentState("compute-state", "StateStore")]
        IPersistentState<IStatefulComputeGrainState> state)
    {
        _state = state;
    }

    public override async Task OnActivateAsync(CancellationToken cancellationToken)
    {
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);

        // Restore GPU state from persistent storage
        await RestoreGpuStateAsync(cancellationToken);

        await base.OnActivateAsync(cancellationToken);
    }

    private async Task RestoreGpuStateAsync(CancellationToken cancellationToken)
    {
        if (_state.State.ModelWeights?.Length > 0)
        {
            var weights = MemoryMarshal.Cast<byte, float>(_state.State.ModelWeights);
            _gpuState = await _accelerator!.AllocateAsync<float>(weights.Length);
            await _gpuState.CopyFromAsync(weights.ToArray());

            _logger.LogInformation(
                "Restored {Size} MB of GPU state",
                _state.State.ModelWeights.Length / 1024 / 1024);
        }
    }

    public async Task UpdateModelAsync(float[] newWeights)
    {
        if (_accelerator == null || _gpuState == null)
            throw new InvalidOperationException("Not activated");

        // Update GPU state
        await _gpuState.CopyFromAsync(newWeights);

        // Persist to Orleans state
        var bytes = MemoryMarshal.AsBytes(newWeights.AsSpan()).ToArray();
        _state.State.ModelWeights = bytes;
        await _state.WriteStateAsync();

        _logger.LogInformation("Model updated and persisted");
    }

    public override async Task OnDeactivateAsync(
        DeactivationReason reason,
        CancellationToken cancellationToken)
    {
        // Save GPU state before deactivation
        if (_gpuState != null)
        {
            var weights = new float[_gpuState.Length];
            await _gpuState.CopyToAsync(weights);

            var bytes = MemoryMarshal.AsBytes(weights.AsSpan()).ToArray();
            _state.State.ModelWeights = bytes;
            await _state.WriteStateAsync();

            await _gpuState.DisposeAsync();
        }

        if (_accelerator != null)
        {
            await _accelerator.ResetAsync(
                ResetOptions.GrainDeactivation,
                cancellationToken);
            await _accelerator.DisposeAsync();
        }

        await base.OnDeactivateAsync(reason, cancellationToken);
    }
}

Best Practices

1. Always Reset on Deactivation

// ✅ Good: Clean deactivation
public override async Task OnDeactivateAsync(
    DeactivationReason reason,
    CancellationToken cancellationToken)
{
    if (_accelerator != null)
    {
        await _accelerator.ResetAsync(
            ResetOptions.GrainDeactivation,
            cancellationToken);
        await _accelerator.DisposeAsync();
    }
    await base.OnDeactivateAsync(reason, cancellationToken);
}

// ❌ Bad: No reset
public override async Task OnDeactivateAsync(
    DeactivationReason reason,
    CancellationToken cancellationToken)
{
    await _accelerator?.DisposeAsync()!;  // Resource leak!
    await base.OnDeactivateAsync(reason, cancellationToken);
}

2. Use Appropriate Reset Levels

// ✅ Good: Match reset to scenario
await _accelerator.ResetAsync(ResetOptions.Soft, ct);           // Between operations
await _accelerator.ResetAsync(ResetOptions.Context, ct);        // Periodic cleanup
await _accelerator.ResetAsync(ResetOptions.Hard, ct);           // Error recovery
await _accelerator.ResetAsync(ResetOptions.GrainDeactivation, ct); // Deactivation

// ❌ Bad: Over-resetting
await _accelerator.ResetAsync(ResetOptions.Full, ct);  // After every operation!

3. Handle Activation Errors

// ✅ Good: Proper error handling
public override async Task OnActivateAsync(CancellationToken cancellationToken)
{
    try
    {
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to initialize GPU");
        throw;  // Let Orleans handle it
    }

    await base.OnActivateAsync(cancellationToken);
}

// ❌ Bad: Swallow errors
public override async Task OnActivateAsync(CancellationToken cancellationToken)
{
    try
    {
        _accelerator = await AcceleratorFactory.CreateAsync(
            AcceleratorType.CUDA,
            cancellationToken);
    }
    catch
    {
        // Grain in invalid state!
    }
}

4. Log Reset Operations

// ✅ Good: Comprehensive logging
var result = await _accelerator.ResetAsync(resetOptions, ct);
_logger.LogInformation(
    "Device reset: Type={Type}, Success={Success}, Duration={Duration}ms, Memory={MemoryMB}MB",
    result.ResetType,
    result.Success,
    result.Duration.TotalMilliseconds,
    result.MemoryFreedBytes / 1024 / 1024);

// ❌ Bad: No logging
await _accelerator.ResetAsync(resetOptions, ct);

Troubleshooting

Grain Activation Fails

Symptoms: Grains fail to activate with GPU errors.

Solutions:

// Add retry logic to activation
public override async Task OnActivateAsync(CancellationToken cancellationToken)
{
    var maxAttempts = 3;
    for (int i = 0; i < maxAttempts; i++)
    {
        try
        {
            _accelerator = await AcceleratorFactory.CreateAsync(
                AcceleratorType.CUDA,
                cancellationToken);
            break;
        }
        catch (Exception ex) when (i < maxAttempts - 1)
        {
            _logger.LogWarning(ex, "GPU initialization attempt {Attempt} failed", i + 1);
            await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, i)));
        }
    }

    await base.OnActivateAsync(cancellationToken);
}

Memory Leaks in Grains

Symptoms: GPU memory usage grows over time.

Solutions:

// Implement periodic cleanup
private Timer? _cleanupTimer;

public override async Task OnActivateAsync(CancellationToken cancellationToken)
{
    _accelerator = await AcceleratorFactory.CreateAsync(
        AcceleratorType.CUDA,
        cancellationToken);

    // Schedule periodic cleanup
    _cleanupTimer = RegisterTimer(
        PeriodicCleanupAsync,
        null,
        TimeSpan.FromMinutes(5),
        TimeSpan.FromMinutes(5));

    await base.OnActivateAsync(cancellationToken);
}

private async Task PeriodicCleanupAsync(object state)
{
    if (_accelerator != null)
    {
        var result = await _accelerator.ResetAsync(
            ResetOptions.Context,
            CancellationToken.None);

        _logger.LogInformation(
            "Periodic cleanup freed {MemoryMB} MB",
            result.MemoryFreedBytes / 1024 / 1024);
    }
}

Slow Deactivation

Symptoms: Grain deactivation takes too long.

Solutions:

public override async Task OnDeactivateAsync(
    DeactivationReason reason,
    CancellationToken cancellationToken)
{
    if (_accelerator != null)
    {
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        cts.CancelAfter(TimeSpan.FromSeconds(5));  // Timeout

        try
        {
            await _accelerator.ResetAsync(
                new ResetOptions
                {
                    ResetType = ResetType.Context,
                    Timeout = TimeSpan.FromSeconds(3),
                    Force = true  // Don't wait if stuck
                },
                cts.Token);
        }
        catch (OperationCanceledException)
        {
            _logger.LogWarning("Reset timed out during deactivation");
        }

        await _accelerator.DisposeAsync();
    }

    await base.OnDeactivateAsync(reason, cancellationToken);
}

See Also