Getting Started with Orleans.GpuBridge.Core

This guide will walk you through creating your first GPU-accelerated Orleans application using Orleans.GpuBridge.Core.

Prerequisites

Before you begin, ensure you have:

.NET 9.0 SDK or later
NVIDIA GPU with CUDA 12.0+ or AMD GPU with ROCm 5.0+
Windows 10/11 or Linux (Ubuntu 22.04+)
Basic familiarity with Microsoft Orleans and C#

Verify GPU Setup

# For NVIDIA GPUs
nvidia-smi

# For AMD GPUs
rocm-smi

Step 1: Create a New Orleans Project

# Create a new console application
dotnet new console -n MyGpuApp
cd MyGpuApp

# Add Orleans packages
dotnet add package Microsoft.Orleans.Server
dotnet add package Microsoft.Orleans.Client

# Add GPU Bridge packages
dotnet add package Orleans.GpuBridge.Core
dotnet add package Orleans.GpuBridge.Backends.DotCompute

Step 2: Define Your Grain Interface

Create a file IVectorAddGrain.cs:

using Orleans;

namespace MyGpuApp;

/// <summary>
/// Grain interface for GPU-accelerated vector addition
/// </summary>
public interface IVectorAddGrain : IGrainWithIntegerKey
{
    /// <summary>
    /// Adds two vectors using GPU acceleration
    /// </summary>
    /// <param name="a">First vector</param>
    /// <param name="b">Second vector</param>
    /// <returns>Sum of the two vectors</returns>
    Task<float[]> AddVectorsAsync(float[] a, float[] b);
}

Step 3: Implement Your GPU Grain

Create a file VectorAddGrain.cs:

using Orleans;
using Orleans.GpuBridge.Abstractions;
using Orleans.Runtime;

namespace MyGpuApp;

/// <summary>
/// GPU-accelerated grain for vector addition
/// </summary>
[GpuAccelerated]
public class VectorAddGrain : Grain, IVectorAddGrain
{
    private readonly ILogger<VectorAddGrain> _logger;

    [GpuKernel("kernels/VectorAdd")]
    private IGpuKernel<VectorAddInput, float[]>? _kernel;

    public VectorAddGrain(ILogger<VectorAddGrain> logger)
    {
        _logger = logger;
    }

    public override Task OnActivateAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("VectorAddGrain {GrainId} activated", this.GetPrimaryKeyLong());
        return base.OnActivateAsync(cancellationToken);
    }

    public async Task<float[]> AddVectorsAsync(float[] a, float[] b)
    {
        if (a.Length != b.Length)
        {
            throw new ArgumentException("Vectors must have the same length");
        }

        _logger.LogInformation("Processing vector addition of length {Length}", a.Length);

        if (_kernel == null)
        {
            throw new InvalidOperationException("GPU kernel not initialized");
        }

        var input = new VectorAddInput { A = a, B = b };
        return await _kernel.ExecuteAsync(input);
    }
}

/// <summary>
/// Input data for vector addition kernel
/// </summary>
public record VectorAddInput
{
    public required float[] A { get; init; }
    public required float[] B { get; init; }
}

Step 4: Implement the GPU Kernel

Create a file VectorAddKernel.cs:

using Orleans.GpuBridge.Abstractions;

namespace MyGpuApp;

/// <summary>
/// CPU fallback implementation of vector addition
/// (GPU implementation would use CUDA/ROCm)
/// </summary>
public class VectorAddKernel : IGpuKernel<VectorAddInput, float[]>
{
    public Task<float[]> ExecuteAsync(VectorAddInput input)
    {
        // CPU fallback implementation
        var result = new float[input.A.Length];
        for (int i = 0; i < input.A.Length; i++)
        {
            result[i] = input.A[i] + input.B[i];
        }
        return Task.FromResult(result);
    }

    public void Dispose()
    {
        // Cleanup GPU resources if needed
    }
}

For production GPU implementation, you would write a CUDA kernel:

// VectorAdd.cu - CUDA kernel implementation
__global__ void vectorAdd(const float* a, const float* b, float* c, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        c[idx] = a[idx] + b[idx];
    }
}

Step 5: Configure the Silo (Server)

Update Program.cs to configure the Orleans silo:

using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Orleans;
using Orleans.GpuBridge.Runtime;
using Orleans.Hosting;
using MyGpuApp;

var builder = Host.CreateDefaultBuilder(args)
    .UseOrleans((context, siloBuilder) =>
    {
        siloBuilder
            .UseLocalhostClustering()
            .ConfigureApplicationParts(parts =>
            {
                parts.AddApplicationPart(typeof(VectorAddGrain).Assembly).WithReferences();
            })
            .UseDashboard(options => options.Port = 8080);

        // Configure GPU Bridge
        siloBuilder.Services.AddGpuBridge(options =>
        {
            options.PreferGpu = true;
            options.EnableRingKernels = false; // Start with simple offload model
        })
        .AddKernel(k => k
            .Id("kernels/VectorAdd")
            .In<VectorAddInput>()
            .Out<float[]>()
            .FromFactory(sp => new VectorAddKernel()));
    })
    .ConfigureLogging(logging =>
    {
        logging.AddConsole();
        logging.SetMinimumLevel(LogLevel.Information);
    });

var host = builder.Build();
await host.RunAsync();

Step 6: Create a Client

Create a file Client.cs for testing:

using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Orleans;
using Orleans.Hosting;

namespace MyGpuApp;

public static class ClientApp
{
    public static async Task RunClientAsync()
    {
        var client = new HostBuilder()
            .UseOrleansClient(clientBuilder =>
            {
                clientBuilder.UseLocalhostClustering();
            })
            .ConfigureLogging(logging =>
            {
                logging.AddConsole();
                logging.SetMinimumLevel(LogLevel.Warning);
            })
            .Build();

        await client.StartAsync();
        var grainFactory = client.Services.GetRequiredService<IGrainFactory>();

        Console.WriteLine("Orleans GPU Bridge - Vector Addition Demo");
        Console.WriteLine("==========================================\n");

        // Get the grain
        var grain = grainFactory.GetGrain<IVectorAddGrain>(0);

        // Test with sample data
        var vectorA = new float[] { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f };
        var vectorB = new float[] { 10.0f, 20.0f, 30.0f, 40.0f, 50.0f };

        Console.WriteLine($"Vector A: [{string.Join(", ", vectorA)}]");
        Console.WriteLine($"Vector B: [{string.Join(", ", vectorB)}]");

        var result = await grain.AddVectorsAsync(vectorA, vectorB);

        Console.WriteLine($"Result:   [{string.Join(", ", result)}]");

        // Benchmark performance
        Console.WriteLine("\nRunning performance benchmark...");
        var largeVectorSize = 1_000_000;
        var largeA = Enumerable.Range(0, largeVectorSize).Select(i => (float)i).ToArray();
        var largeB = Enumerable.Range(0, largeVectorSize).Select(i => (float)i * 2).ToArray();

        var sw = System.Diagnostics.Stopwatch.StartNew();
        var largeResult = await grain.AddVectorsAsync(largeA, largeB);
        sw.Stop();

        Console.WriteLine($"Processed {largeVectorSize:N0} elements in {sw.ElapsedMilliseconds}ms");
        Console.WriteLine($"Throughput: {largeVectorSize / sw.Elapsed.TotalSeconds:N0} elements/second");

        await client.StopAsync();
    }
}

Update Program.cs to include client execution:

// Add this to Program.cs after host initialization
var siloTask = host.RunAsync();

// Wait for silo to start
await Task.Delay(5000);

// Run client
await ClientApp.RunClientAsync();

await siloTask;

Step 7: Run Your Application

# Build the project
dotnet build

# Run the application
dotnet run

You should see output similar to:

Orleans GPU Bridge - Vector Addition Demo
==========================================

Vector A: [1, 2, 3, 4, 5]
Vector B: [10, 20, 30, 40, 50]
Result:   [11, 22, 33, 44, 55]

Running performance benchmark...
Processed 1,000,000 elements in 45ms
Throughput: 22,222,222 elements/second

Step 8: Enable GPU Acceleration

To use actual GPU execution instead of CPU fallback:

Implement CUDA Kernel: Create VectorAdd.cu with CUDA code
Compile Kernel: Use nvcc to compile to PTX or cubin
Load Kernel: Update VectorAddKernel to load compiled GPU code
Configure Backend: Ensure DotCompute backend is properly configured

Example with DotCompute backend:

public class VectorAddKernel : IGpuKernel<VectorAddInput, float[]>
{
    private readonly IDotComputeDevice _device;
    private readonly IDotComputeKernel _kernel;

    public VectorAddKernel(IDotComputeDevice device)
    {
        _device = device;
        _kernel = _device.LoadKernel("VectorAdd.ptx", "vectorAdd");
    }

    public async Task<float[]> ExecuteAsync(VectorAddInput input)
    {
        var n = input.A.Length;
        var result = new float[n];

        // Allocate GPU memory
        using var gpuA = _device.Allocate(input.A);
        using var gpuB = _device.Allocate(input.B);
        using var gpuC = _device.Allocate<float>(n);

        // Launch kernel
        var blockSize = 256;
        var gridSize = (n + blockSize - 1) / blockSize;

        await _kernel.LaunchAsync(
            gridDim: new Dim3(gridSize),
            blockDim: new Dim3(blockSize),
            args: new object[] { gpuA, gpuB, gpuC, n }
        );

        // Copy result back
        await gpuC.CopyToAsync(result);

        return result;
    }

    public void Dispose()
    {
        _kernel?.Dispose();
    }
}

Next Steps

Now that you have a working GPU-accelerated Orleans application, explore:

Concepts and Background - Learn about GPU-native actors and ring kernels
Architecture Overview - Understand the system design
Hypergraph Actors - Build multi-way relationships
Temporal Correctness - Implement HLC and Vector Clocks
Advanced Examples - Production patterns

Troubleshooting

GPU Not Detected

If your GPU is not detected:

# Check GPU drivers
nvidia-smi  # or rocm-smi for AMD

# Verify CUDA installation
nvcc --version

# Check .NET GPU support
dotnet --info

Performance Issues

If performance is lower than expected:

Enable ring kernels for persistent GPU state
Increase batch sizes for better GPU utilization
Profile with NVIDIA Nsight or AMD ROCProfiler
Check memory transfer overhead

Common Errors

"Kernel not found": Ensure kernel ID matches registration "GPU memory allocation failed": Check available GPU memory "Device not available": Verify GPU drivers and CUDA installation

Resources

API Reference - Complete API documentation
GPU-Native Actors Guide - Advanced GPU patterns
GitHub Repository - Source code and examples
Orleans Documentation - Microsoft Orleans reference

← Back to Home | Next: Concepts →