Table of Contents

Error Handling

This module covers debugging GPU code and implementing robust error handling for production applications.

GPU Error Types

Synchronous Errors

Detected immediately during API calls:

Error Cause Solution
InvalidArgument Bad parameter Validate inputs
OutOfMemory Allocation failed Reduce size, free unused
InvalidDevice Device not available Check device selection
CompilationFailed Kernel syntax error Fix kernel code

Asynchronous Errors

Detected during or after kernel execution:

Error Cause Solution
LaunchFailed Invalid configuration Check grid/block sizes
IllegalAddress Out-of-bounds access Add bounds checks
IllegalInstruction Unsupported operation Check device capability
AssertFailed Kernel assertion Debug kernel logic

Basic Error Handling

Try-Catch Pattern

/// <summary>
/// Runs the kernel over <paramref name="data"/> in place, converting recoverable
/// GPU failures into a <c>false</c> return instead of letting them propagate.
/// </summary>
/// <param name="data">Input array; kernel results are copied back into it on success.</param>
/// <returns><c>true</c> when the kernel completed; <c>false</c> on GPU OOM or kernel failure.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public async Task<bool> SafeExecuteAsync(float[] data)
{
    ArgumentNullException.ThrowIfNull(data);

    try
    {
        using var buffer = _orchestrator.CreateBuffer<float>(data.Length);
        await buffer.CopyFromAsync(data);
        await _orchestrator.ExecuteKernelAsync(kernel, config, buffer);
        await buffer.CopyToAsync(data);
        return true;
    }
    catch (OutOfMemoryException ex)
    {
        // Recoverable: the caller can retry with a smaller batch.
        _logger.LogError(ex, "GPU out of memory for {Size} elements", data.Length);
        return false;
    }
    catch (KernelExecutionException ex)
    {
        // Recoverable: the device-reported error is preserved in the log.
        _logger.LogError(ex, "Kernel execution failed: {Error}", ex.GpuError);
        return false;
    }
    catch (Exception ex)
    {
        // Unknown failure mode — log and let it surface to the caller.
        _logger.LogError(ex, "Unexpected error during GPU computation");
        throw;
    }
}

Error Checking After Operations

/// <summary>
/// Launches the kernel, then forces a device synchronization so that any
/// asynchronous execution error surfaces as a thrown exception.
/// </summary>
public async Task ExecuteWithValidation()
{
    await _orchestrator.ExecuteKernelAsync(kernel, config, buffer);

    // Kernel launches are asynchronous; errors only surface at sync points.
    var status = await _orchestrator.SynchronizeAsync();

    if (status is not GpuError.Success)
    {
        throw new KernelExecutionException($"Kernel failed: {status}");
    }
}

Debugging Kernels

Debug Mode Compilation

// Enable detailed logging for debugging
host.Services.AddLogging(logging =>
{
    logging.AddConsole();
    logging.SetMinimumLevel(LogLevel.Debug);
    // Trace-level output only for the DotCompute namespace, to limit noise
    logging.AddFilter("DotCompute", LogLevel.Trace);
});

host.Services.AddDotComputeRuntime();

Kernel Assertions

// Kernel demonstrating device-side assertions for catching logic errors.
// Assertions fire only in debug builds and abort the kernel with a message.
[Kernel]
public static void DebugKernel(Span<float> data, int expectedLength)
{
    int idx = Kernel.ThreadId.X;

    // Debug assertion (only in debug builds)
    Kernel.Assert(idx < expectedLength, "Thread index out of bounds");
    Kernel.Assert(data.Length == expectedLength, "Buffer size mismatch");

    // Bounds check still required: assertions compile out in release builds.
    if (idx < data.Length)
    {
        float value = data[idx];
        // Catch bad input early, before it propagates through ProcessValue.
        Kernel.Assert(!float.IsNaN(value), "NaN detected in input");

        data[idx] = ProcessValue(value);
    }
}

Printf-Style Debugging

// Kernel demonstrating device-side printf for tracing values.
// Output is limited to the first few threads to keep it readable.
[Kernel(EnablePrintf = true)]
public static void DebugWithPrintf(Span<float> data)
{
    int idx = Kernel.ThreadId.X;

    // Bounds check FIRST: the original version printed data[idx] for idx < 5
    // before checking data.Length, which is itself an out-of-bounds read
    // whenever the buffer holds fewer than 5 elements.
    if (idx >= data.Length)
        return;

    if (idx < 5)  // Limit output
    {
        Kernel.Printf("Thread %d: input = %f\n", idx, data[idx]);
    }

    data[idx] = data[idx] * 2;

    if (idx < 5)
    {
        Kernel.Printf("Thread %d: output = %f\n", idx, data[idx]);
    }
}

CPU Validation Mode

Run kernel on CPU for debugging:

// Force CPU backend for easier debugging.
// Note: the argument variable is named "parameters" because "params" is a
// C# keyword and cannot be used as an identifier (it would not compile).
await orchestrator.ExecuteAsync<object>(
    kernelName: "MyKernel",
    preferredBackend: "CPU",  // Force CPU for debugging
    args: parameters);

Production Error Handling

Retry with Fallback

/// <summary>
/// Runs GPU computations with bounded retries and a CPU fallback, so transient
/// device failures never bubble up to callers.
/// </summary>
public class ResilientComputeService
{
    private readonly IComputeOrchestrator _orchestrator;
    private readonly ILogger<ResilientComputeService> _logger;

    /// <summary>
    /// Attempts the GPU path up to <paramref name="maxRetries"/> times with
    /// linear backoff, then falls back to the CPU implementation.
    /// </summary>
    public async Task<float[]> ComputeAsync(float[] input, int maxRetries = 3)
    {
        for (int attempt = 0; attempt < maxRetries; attempt++)
        {
            try
            {
                return await ComputeOnGpuAsync(input);
            }
            // No `when (attempt < maxRetries - 1)` filter here: with the filter,
            // the final attempt's exception propagates and the CPU fallback
            // below becomes unreachable. Catching unconditionally lets the
            // loop exit normally so the fallback actually runs.
            catch (OutOfMemoryException)
            {
                _logger.LogWarning("GPU OOM on attempt {Attempt}, retrying with smaller batch", attempt + 1);
                // Could try smaller batch size here
            }
            catch (KernelExecutionException ex)
            {
                _logger.LogWarning(ex, "Kernel failed on attempt {Attempt}, retrying", attempt + 1);
                await Task.Delay(100 * (attempt + 1));  // Backoff
            }
        }

        // Fallback to CPU
        _logger.LogWarning("GPU failed after retries, falling back to CPU");
        return await ComputeOnCpuAsync(input);
    }
}

Resource Cleanup

/// <summary>
/// Owns a set of GPU buffers for a multi-step computation and guarantees they
/// are released — on explicit disposal or on any processing error.
/// </summary>
public class ComputePipeline : IAsyncDisposable
{
    // Buffers created by ProcessAsync; all released together in DisposeAsync.
    // NOTE(review): buffers accumulate across ProcessAsync calls and are only
    // freed on disposal — confirm that unbounded growth is intended.
    private readonly List<IBuffer<float>> _buffers = new();
    private bool _disposed;

    /// <summary>
    /// Uploads <paramref name="data"/>, runs the kernel, and copies results back
    /// into the same array.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The pipeline has been disposed.</exception>
    public async Task ProcessAsync(float[] data)
    {
        if (_disposed)
            throw new ObjectDisposedException(nameof(ComputePipeline));

        try
        {
            var buffer = _orchestrator.CreateBuffer<float>(data.Length);
            _buffers.Add(buffer);

            await buffer.CopyFromAsync(data);
            await _orchestrator.ExecuteKernelAsync(kernel, config, buffer);
            await buffer.CopyToAsync(data);
        }
        catch
        {
            // Ensure cleanup on error
            // Note: this disposes the ENTIRE pipeline, so the instance is
            // unusable after any failed ProcessAsync call.
            await DisposeAsync();
            throw;
        }
    }

    /// <summary>
    /// Synchronizes the device so no kernel is still using the buffers, then
    /// releases them. Safe to call multiple times.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_disposed) return;
        _disposed = true;

        // Synchronize to ensure all operations complete
        await _orchestrator.SynchronizeAsync();

        // Dispose all buffers
        // NOTE(review): synchronous Dispose on each buffer — confirm IBuffer<T>
        // has no async disposal path that should be preferred here.
        foreach (var buffer in _buffers)
        {
            buffer.Dispose();
        }
        _buffers.Clear();
    }
}

Health Monitoring

/// <summary>
/// Probes the active GPU backend (memory pressure, smoke-test kernel,
/// temperature) and reports a <see cref="GpuHealth"/> snapshot.
/// </summary>
public class GpuHealthMonitor
{
    private readonly IComputeOrchestrator _orchestrator;
    private readonly IUnifiedAcceleratorFactory _factory;
    private readonly ILogger _logger;

    /// <summary>
    /// Collects current health metrics; any probe failure yields an unhealthy
    /// result rather than an exception.
    /// </summary>
    public async Task<GpuHealth> CheckHealthAsync()
    {
        try
        {
            var backend = _orchestrator.ActiveBackend;

            // Memory pressure: fraction of total device memory currently in use.
            var free = backend.GetFreeMemory();
            var total = backend.GetTotalMemory();
            var usedFraction = (total - free) / (double)total;

            // Smoke-test kernel, then temperature (if the device exposes it).
            var kernelOk = await RunTestKernelAsync();
            var temperature = backend.GetTemperature();

            return new GpuHealth
            {
                // Healthy = test kernel ran AND memory is not nearly exhausted.
                IsHealthy = kernelOk && usedFraction < 0.95,
                MemoryUsagePercent = usedFraction * 100,
                FreeMemoryMB = free / (1024 * 1024),
                TemperatureCelsius = temperature,
                Status = kernelOk ? "OK" : "Test Failed"
            };
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "GPU health check failed");
            return new GpuHealth
            {
                IsHealthy = false,
                Status = $"Error: {ex.Message}"
            };
        }
    }

    // Runs a trivial identity kernel on a small buffer; any throw is treated
    // as "device unusable" and reported as a failed test.
    private async Task<bool> RunTestKernelAsync()
    {
        try
        {
            using var buffer = _orchestrator.CreateBuffer<float>(1024);
            await _orchestrator.ExecuteKernelAsync(
                TestKernels.Identity,
                new KernelConfig { BlockSize = 256, GridSize = 4 },
                buffer);
            return true;
        }
        catch
        {
            return false;
        }
    }
}

Common Issues and Solutions

Issue 1: Illegal Memory Access

Symptom: Kernel crashes with CUDA_ERROR_ILLEGAL_ADDRESS

Cause: Out-of-bounds array access

Solution:

// Kernel with the mandatory bounds check: only threads that map to a valid
// element touch the buffer, so no launch configuration can cause an
// out-of-bounds access.
[Kernel]
public static void SafeKernel(Span<float> data)
{
    int tid = Kernel.ThreadId.X;

    if (tid < data.Length)
    {
        data[tid] = data[tid] * 2;
    }
}

Issue 2: Timeout/Hang

Symptom: Kernel never completes

Cause: Infinite loop or deadlock

Solution:

// Set execution timeout
var timeout = TimeSpan.FromSeconds(30);

// CancellationTokenSource fires automatically once the timeout elapses.
using var cts = new CancellationTokenSource(timeout);

try
{
    // WaitAsync abandons the await when the token fires; the kernel itself
    // may still be running on the device until the reset below.
    await _orchestrator.ExecuteKernelAsync(kernel, config, buffer)
        .WaitAsync(cts.Token);
}
catch (OperationCanceledException)
{
    _logger.LogError("Kernel execution timed out");
    // Reset the device to recover from a hung kernel before rethrowing.
    await _orchestrator.ResetDeviceAsync();
    throw new TimeoutException("Kernel execution exceeded time limit");
}

Issue 3: Numerical Instability

Symptom: NaN or Inf results

Solution:

// Numerically defensive kernel: rejects NaN/Inf inputs and clamps near-zero
// denominators so the division can never blow up.
[Kernel]
public static void StableKernel(Span<float> data)
{
    int tid = Kernel.ThreadId.X;
    if (tid >= data.Length) return;

    float input = data[tid];

    // A NaN or Inf input would poison every downstream result; zero it out.
    if (float.IsNaN(input) || float.IsInfinity(input))
    {
        data[tid] = 0;  // or handle appropriately
        return;
    }

    // Clamp the magnitude of near-zero denominators before dividing.
    float divisor = CalculateDenominator(input);
    if (MathF.Abs(divisor) < 1e-10f)
    {
        divisor = 1e-10f;
    }

    data[tid] = input / divisor;
}

Logging Best Practices

/// <summary>
/// Decorates an async GPU operation with start/completion/failure logging and
/// wall-clock timing.
/// </summary>
public class GpuOperationLogger
{
    private readonly ILogger _logger;

    /// <summary>
    /// Executes <paramref name="operation"/>, logging its duration on success
    /// and its duration plus exception on failure (which is rethrown).
    /// </summary>
    public async Task<T> LoggedOperationAsync<T>(
        string operationName,
        Func<Task<T>> operation)
    {
        var stopwatch = Stopwatch.StartNew();

        try
        {
            _logger.LogDebug("Starting GPU operation: {Operation}", operationName);

            var outcome = await operation();

            _logger.LogDebug(
                "GPU operation completed: {Operation}, Duration: {Duration}ms",
                operationName, stopwatch.ElapsedMilliseconds);

            return outcome;
        }
        catch (Exception ex)
        {
            // Same timing fields on failure so dashboards can correlate both paths.
            _logger.LogError(ex,
                "GPU operation failed: {Operation}, Duration: {Duration}ms",
                operationName, stopwatch.ElapsedMilliseconds);
            throw;
        }
    }
}

Exercises

Exercise 1: Error Recovery

Implement a compute service that automatically retries failed operations with exponential backoff.

Exercise 2: Debug Kernel

Add assertions and printf debugging to identify a bug in a provided kernel.

Exercise 3: Health Dashboard

Create a monitoring dashboard that displays GPU health metrics.

Key Takeaways

  1. Synchronize to catch async errors - GPU errors may not surface immediately
  2. Add bounds checks to all kernels - Most crashes come from out-of-bounds access
  3. Use CPU mode for debugging - Easier to step through and inspect
  4. Implement fallbacks - Production systems should handle GPU failures
  5. Monitor GPU health - Catch issues before they cause failures

Path Complete

Congratulations! You've completed the Intermediate Learning Path.

What you learned:

  • Memory optimization and pooling
  • Kernel performance tuning
  • Multi-kernel pipeline design
  • Production error handling

Next steps: