Debugging System Architecture
The Debugging System provides production-grade cross-backend validation, performance analysis, and determinism testing to ensure kernel correctness across CPU and GPU implementations.
Architecture Overview
Application Code
↓
IComputeOrchestrator
↓
DebugIntegratedOrchestrator (Transparent Wrapper)
↓
┌─────────────────────────────────────────────────┐
│ IKernelDebugService │
├─────────────────────────────────────────────────┤
│ - Cross-backend validation │
│ - Performance profiling │
│ - Determinism testing │
│ - Memory pattern analysis │
└─────────────────────────────────────────────────┘
↓ ↓
Primary Execution Reference Execution
(GPU - Fast) (CPU - Trusted)
↓ ↓
Compare Results → Validation Report
Design Principles
1. Production-Safe
- Minimal Overhead: < 5% in Production profile
- Selective Validation: Only suspicious results checked
- No Breaking Changes: Logs warnings by default; throws only when ThrowOnValidationFailure is enabled
2. Transparent Integration
- Wrapper Pattern: Wraps IComputeOrchestrator transparently (a registration sketch follows this list)
- No Code Changes: Enable via configuration only
- Opt-In: Disabled by default in Release builds
3. Actionable Insights
- Detailed Reports: What failed, where, and why
- Suggested Fixes: Guidance on resolving issues
- Performance Data: Execution times, memory usage
4. Multiple Profiles
- Development: Extensive validation (2-5x overhead)
- Testing: Balanced validation (20-50% overhead)
- Production: Selective validation (< 5% overhead)
- Disabled: No validation overhead
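The wrapper from principle 2 is wired up as an ordinary dependency-injection decorator. The snippet below is only a sketch of that pattern: the concrete inner orchestrator name (ComputeOrchestrator) and both constructor shapes are assumptions, and it presumes DebugOptions has already been registered (for example via Configure<DebugOptions>). The supported entry point remains the AddProductionDebugging extension shown under Service Registration.

// Sketch of the decorator wiring; prefer AddProductionDebugging() in real code.
// ComputeOrchestrator and both constructors shown here are assumed shapes.
services.AddSingleton<ComputeOrchestrator>();                       // inner orchestrator (assumed concrete name)
services.AddSingleton<IKernelDebugService>(sp =>
    new KernelDebugService(
        sp.GetRequiredService<ComputeOrchestrator>(),               // debug service executes against the inner orchestrator
        sp.GetRequiredService<IOptions<DebugOptions>>().Value,
        sp.GetRequiredService<ILogger<KernelDebugService>>()));
services.AddSingleton<IComputeOrchestrator>(sp =>
    new DebugIntegratedOrchestrator(
        sp.GetRequiredService<ComputeOrchestrator>(),               // wrapped orchestrator
        sp.GetRequiredService<IKernelDebugService>(),
        sp.GetRequiredService<IOptions<DebugOptions>>().Value));

Because the debug service is built against the inner orchestrator rather than the decorated interface, validation runs never recurse back through the wrapper.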
IKernelDebugService Interface
Complete debugging and validation interface:
public interface IKernelDebugService
{
/// <summary>
/// Validates kernel execution across multiple backends
/// </summary>
Task<ValidationResult> ValidateCrossBackendAsync(
string kernelName,
object parameters,
AcceleratorType primaryBackend,
AcceleratorType referenceBackend,
CancellationToken cancellationToken = default);
/// <summary>
/// Profiles kernel performance characteristics
/// </summary>
Task<PerformanceProfile> ProfileKernelAsync(
string kernelName,
object parameters,
AcceleratorType backend,
int iterations = 100,
CancellationToken cancellationToken = default);
/// <summary>
/// Tests kernel determinism (same input → same output)
/// </summary>
Task<DeterminismResult> TestDeterminismAsync(
string kernelName,
object parameters,
AcceleratorType backend,
int runs = 10,
CancellationToken cancellationToken = default);
/// <summary>
/// Analyzes memory access patterns for optimization opportunities
/// </summary>
Task<MemoryAccessReport> AnalyzeMemoryPatternsAsync(
string kernelName,
object parameters,
AcceleratorType backend,
CancellationToken cancellationToken = default);
/// <summary>
/// Validates numerical stability (no overflow, NaN, Inf)
/// </summary>
Task<NumericalStabilityResult> ValidateNumericalStabilityAsync(
string kernelName,
object parameters,
AcceleratorType backend,
CancellationToken cancellationToken = default);
/// <summary>
/// Compares kernel output against known golden reference
/// </summary>
Task<ValidationResult> ValidateAgainstGoldenAsync(
string kernelName,
object parameters,
object expectedOutput,
AcceleratorType backend,
CancellationToken cancellationToken = default);
/// <summary>
/// Stress tests kernel with random inputs
/// </summary>
Task<StressTestResult> StressTestKernelAsync(
string kernelName,
IInputGenerator inputGenerator,
AcceleratorType backend,
int iterations = 1000,
CancellationToken cancellationToken = default);
/// <summary>
/// Detects race conditions and thread-safety issues
/// </summary>
Task<RaceConditionReport> DetectRaceConditionsAsync(
string kernelName,
object parameters,
AcceleratorType backend,
int concurrentExecutions = 100,
CancellationToken cancellationToken = default);
}
Cross-Backend Validation
Validation Strategy
public class KernelDebugService : IKernelDebugService
{
private readonly IComputeOrchestrator _orchestrator;
private readonly DebugOptions _options;
private readonly ILogger<KernelDebugService> _logger;
public async Task<ValidationResult> ValidateCrossBackendAsync(
string kernelName,
object parameters,
AcceleratorType primaryBackend,
AcceleratorType referenceBackend,
CancellationToken cancellationToken = default)
{
// 1. Execute on primary backend (e.g., GPU)
var primaryStopwatch = Stopwatch.StartNew();
var primaryResult = await _orchestrator.ExecuteKernelAsync(
kernelName,
parameters,
forceBackend: primaryBackend
);
primaryStopwatch.Stop();
// 2. Execute on reference backend (e.g., CPU)
var referenceStopwatch = Stopwatch.StartNew();
var referenceResult = await _orchestrator.ExecuteKernelAsync(
kernelName,
parameters,
forceBackend: referenceBackend
);
referenceStopwatch.Stop();
// 3. Compare results
var differences = CompareResults(
primaryResult,
referenceResult,
tolerance: _options.ToleranceThreshold
);
// 4. Analyze differences
var severity = DetermineSeverity(differences);
// 5. Log findings
if (differences.Any())
{
_logger.LogWarning(
"Cross-backend validation found {Count} differences for {Kernel}. " +
"Primary: {Primary} ({PrimaryTime}ms), Reference: {Reference} ({ReferenceTime}ms)",
differences.Count,
kernelName,
primaryBackend,
primaryStopwatch.Elapsed.TotalMilliseconds,
referenceBackend,
referenceStopwatch.Elapsed.TotalMilliseconds
);
}
return new ValidationResult
{
IsValid = !differences.Any() || severity < Severity.Critical,
Differences = differences,
PrimaryBackend = primaryBackend,
PrimaryExecutionTime = primaryStopwatch.Elapsed,
ReferenceBackend = referenceBackend,
ReferenceExecutionTime = referenceStopwatch.Elapsed,
Severity = severity,
Recommendation = GenerateRecommendation(differences, severity)
};
}
private List<Difference> CompareResults(
object primaryResult,
object referenceResult,
double tolerance)
{
var differences = new List<Difference>();
// Handle arrays
if (primaryResult is Array primaryArray && referenceResult is Array referenceArray)
{
if (primaryArray.Length != referenceArray.Length)
{
differences.Add(new Difference
{
Type = DifferenceType.LengthMismatch,
Index = -1,
PrimaryValue = primaryArray.Length,
ReferenceValue = referenceArray.Length
});
return differences;
}
// Compare elements
for (int i = 0; i < primaryArray.Length; i++)
{
var primaryValue = Convert.ToDouble(primaryArray.GetValue(i));
var referenceValue = Convert.ToDouble(referenceArray.GetValue(i));
if (!AreEqual(primaryValue, referenceValue, tolerance))
{
differences.Add(new Difference
{
Type = DifferenceType.ValueMismatch,
Index = i,
PrimaryValue = primaryValue,
ReferenceValue = referenceValue,
RelativeError = referenceValue != 0
? Math.Abs((primaryValue - referenceValue) / referenceValue)
: Math.Abs(primaryValue - referenceValue)
});
}
}
}
return differences;
}
private bool AreEqual(double a, double b, double tolerance)
{
// Handle special values
if (double.IsNaN(a) && double.IsNaN(b)) return true;
if (double.IsInfinity(a) && double.IsInfinity(b)) return a.Equals(b);
// Relative tolerance comparison
var diff = Math.Abs(a - b);
var magnitude = Math.Max(Math.Abs(a), Math.Abs(b));
return diff <= tolerance * magnitude || diff <= tolerance;
}
}
Validation Result
public class ValidationResult
{
/// <summary>
/// True if validation passed (no differences, or none severe enough to be Critical)
/// </summary>
public bool IsValid { get; init; }
/// <summary>
/// List of differences found
/// </summary>
public IReadOnlyList<Difference> Differences { get; init; }
/// <summary>
/// Primary backend used for execution
/// </summary>
public AcceleratorType PrimaryBackend { get; init; }
/// <summary>
/// Primary execution time
/// </summary>
public TimeSpan PrimaryExecutionTime { get; init; }
/// <summary>
/// Reference backend (typically CPU)
/// </summary>
public AcceleratorType ReferenceBackend { get; init; }
/// <summary>
/// Reference execution time
/// </summary>
public TimeSpan ReferenceExecutionTime { get; init; }
/// <summary>
/// Speedup factor (reference time / primary time)
/// </summary>
public double Speedup => ReferenceExecutionTime.TotalMilliseconds /
PrimaryExecutionTime.TotalMilliseconds;
/// <summary>
/// Severity of validation failure
/// </summary>
public Severity Severity { get; init; }
/// <summary>
/// Recommended action to fix issues
/// </summary>
public string Recommendation { get; init; }
}
public class Difference
{
public DifferenceType Type { get; init; }
public int Index { get; init; }
public object PrimaryValue { get; init; }
public object ReferenceValue { get; init; }
public double RelativeError { get; init; }
}
public enum DifferenceType
{
ValueMismatch,
LengthMismatch,
TypeMismatch,
NumericalInstability
}
public enum Severity
{
None,
Low, // Within 10x tolerance
Medium, // Within 100x tolerance
High, // Within 1000x tolerance
Critical // Beyond 1000x tolerance
}
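DetermineSeverity and GenerateRecommendation are referenced in ValidateCrossBackendAsync but not reproduced on this page. A minimal sketch of the severity grading, assuming it buckets the worst relative error by multiples of the configured tolerance as the enum comments describe:

private Severity DetermineSeverity(List<Difference> differences)
{
    if (differences.Count == 0)
        return Severity.None;

    // A length or type mismatch is unrecoverable regardless of magnitude.
    if (differences.Any(d => d.Type != DifferenceType.ValueMismatch))
        return Severity.Critical;

    // Grade by the worst relative error, expressed as a multiple of the tolerance.
    var worst = differences.Max(d => d.RelativeError);
    var tolerance = _options.ToleranceThreshold;

    if (worst <= tolerance * 10) return Severity.Low;
    if (worst <= tolerance * 100) return Severity.Medium;
    if (worst <= tolerance * 1000) return Severity.High;
    return Severity.Critical;
}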
Debugging Profiles
Profile Configuration
public enum DebugProfile
{
/// <summary>
/// No validation (0% overhead)
/// </summary>
Disabled,
/// <summary>
/// Minimal validation in production (< 5% overhead)
/// Validates only suspicious results (NaN, Inf, unusually large values)
/// </summary>
Production,
/// <summary>
/// Balanced validation for testing (20-50% overhead)
/// Validates random 10% sample of executions
/// </summary>
Testing,
/// <summary>
/// Extensive validation for development (2-5x overhead)
/// Validates every execution
/// </summary>
Development
}
public class DebugOptions
{
/// <summary>
/// Debugging profile
/// </summary>
public DebugProfile Profile { get; set; } = DebugProfile.Disabled;
/// <summary>
/// Enable cross-backend validation
/// </summary>
public bool EnableCrossBackendValidation { get; set; } = true;
/// <summary>
/// Reference backend for validation (default: CPU)
/// </summary>
public AcceleratorType ReferenceBackend { get; set; } = AcceleratorType.CPU;
/// <summary>
/// Tolerance threshold for floating-point comparison
/// </summary>
public double ToleranceThreshold { get; set; } = 1e-5;
/// <summary>
/// Validate all executions (Development profile)
/// </summary>
public bool ValidateAllExecutions { get; set; } = false;
/// <summary>
/// Sampling rate for Testing profile (0.0 to 1.0)
/// </summary>
public double SamplingRate { get; set; } = 0.1; // 10% sample
/// <summary>
/// Throw exception on validation failure
/// </summary>
public bool ThrowOnValidationFailure { get; set; } = false;
/// <summary>
/// Enable performance profiling
/// </summary>
public bool EnablePerformanceProfiling { get; set; } = true;
/// <summary>
/// Enable determinism testing
/// </summary>
public bool EnableDeterminismTesting { get; set; } = false;
}
Profile Characteristics
| Profile | Overhead | Validation | Use Case |
|---|---|---|---|
| Disabled | 0% | None | Production, performance-critical |
| Production | < 5% | Suspicious results only | Production monitoring |
| Testing | 20-50% | 10% random sample | CI/CD, integration tests |
| Development | 2-5x | Every execution | Local development, debugging |
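Profiles are usually chosen per environment rather than hard-coded. A sketch, assuming the options are bound from a configuration section named "DotCompute:Debug" (the section name and the use of builder.Configuration are illustrative, not a documented convention):

// appsettings.Production.json (illustrative section name)
// {
//   "DotCompute": {
//     "Debug": { "Profile": "Production", "ToleranceThreshold": 1e-5 }
//   }
// }
builder.Services.AddProductionDebugging(options =>
    builder.Configuration.GetSection("DotCompute:Debug").Bind(options));

This keeps Development, Testing, and Production profiles selectable through standard configuration layering without code changes.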
Profile Implementation
public class DebugIntegratedOrchestrator : IComputeOrchestrator
{
private readonly IComputeOrchestrator _innerOrchestrator;
private readonly IKernelDebugService _debugService;
private readonly DebugOptions _options;
private readonly ILogger<DebugIntegratedOrchestrator> _logger;
private readonly Random _random = new();
public async Task<TResult> ExecuteKernelAsync<TResult>(
string kernelName,
object parameters,
CancellationToken cancellationToken = default)
{
// Check if validation needed based on profile
if (!ShouldValidate(kernelName))
{
return await _innerOrchestrator.ExecuteKernelAsync<TResult>(
kernelName,
parameters,
cancellationToken
);
}
// Execute with validation
var result = await _innerOrchestrator.ExecuteKernelAsync<TResult>(
kernelName,
parameters,
cancellationToken
);
// Validate result
var validation = await _debugService.ValidateCrossBackendAsync(
kernelName,
parameters,
primaryBackend: AcceleratorType.Auto,
referenceBackend: _options.ReferenceBackend,
cancellationToken
);
// Handle validation failure
if (!validation.IsValid)
{
HandleValidationFailure(kernelName, validation);
}
return result;
}
private bool ShouldValidate(string kernelName)
{
return _options.Profile switch
{
DebugProfile.Disabled => false,
DebugProfile.Development => true,
DebugProfile.Testing => _random.NextDouble() < _options.SamplingRate,
DebugProfile.Production => false, // Validate in post-execution hook instead
_ => false
};
}
private void HandleValidationFailure(string kernelName, ValidationResult validation)
{
var message = $"Validation failed for kernel '{kernelName}': " +
$"{validation.Differences.Count} differences found " +
$"(severity: {validation.Severity}). " +
$"Recommendation: {validation.Recommendation}";
if (_options.ThrowOnValidationFailure)
{
throw new ValidationException(message, validation);
}
else
{
_logger.LogWarning(message);
}
}
}
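The Production branch above defers work to a post-execution check that only inspects results for suspicious values, as described under the Production profile. A minimal sketch of such a check (the helper name and the float-array assumption are illustrative):

// Cheap screen used before deciding whether a full cross-backend validation is worthwhile.
private static bool LooksSuspicious(object result, double magnitudeLimit = 1e30)
{
    // Only float[] results are screened in this sketch; other shapes would need their own checks.
    if (result is not float[] values)
        return false;

    foreach (var v in values)
    {
        if (float.IsNaN(v) || float.IsInfinity(v) || Math.Abs(v) > magnitudeLimit)
            return true;
    }
    return false;
}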
Performance Profiling
Performance Profile
public class PerformanceProfile
{
/// <summary>
/// Kernel name
/// </summary>
public string KernelName { get; init; }
/// <summary>
/// Backend used for execution
/// </summary>
public AcceleratorType Backend { get; init; }
/// <summary>
/// Number of iterations executed
/// </summary>
public int Iterations { get; init; }
/// <summary>
/// Total execution time
/// </summary>
public TimeSpan TotalTime { get; init; }
/// <summary>
/// Average execution time per iteration
/// </summary>
public TimeSpan AverageTime => TimeSpan.FromTicks(TotalTime.Ticks / Iterations);
/// <summary>
/// Minimum execution time
/// </summary>
public TimeSpan MinTime { get; init; }
/// <summary>
/// Maximum execution time
/// </summary>
public TimeSpan MaxTime { get; init; }
/// <summary>
/// Standard deviation of execution times
/// </summary>
public TimeSpan StandardDeviation { get; init; }
/// <summary>
/// Throughput (iterations per second)
/// </summary>
public double Throughput => Iterations / TotalTime.TotalSeconds;
/// <summary>
/// Memory allocated (bytes)
/// </summary>
public long MemoryAllocated { get; init; }
/// <summary>
/// Memory transferred (bytes)
/// </summary>
public long MemoryTransferred { get; init; }
/// <summary>
/// Floating-point operations executed per iteration (FLOPs)
/// </summary>
public double ComputeIntensity { get; init; }
/// <summary>
/// Performance in GFLOPS
/// </summary>
public double GFLOPS => ComputeIntensity / AverageTime.TotalSeconds / 1e9;
}
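As a worked example (numbers are illustrative): a 1024×1024 single-precision matrix multiply performs roughly 2·1024³ ≈ 2.15 × 10⁹ floating-point operations per iteration. If that value is supplied as ComputeIntensity and the profiled AverageTime is 5 ms, the reported GFLOPS is 2.15 × 10⁹ / 0.005 / 10⁹ ≈ 430.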
Profiling Implementation
public async Task<PerformanceProfile> ProfileKernelAsync(
string kernelName,
object parameters,
AcceleratorType backend,
int iterations = 100,
CancellationToken cancellationToken = default)
{
var times = new List<TimeSpan>();
long memoryAllocated = 0;
long memoryTransferred = 0;
// Warm-up (exclude from measurements)
await _orchestrator.ExecuteKernelAsync(
kernelName,
parameters,
forceBackend: backend
);
// Profile iterations
var totalStopwatch = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
var iterationStopwatch = Stopwatch.StartNew();
await _orchestrator.ExecuteKernelAsync(
kernelName,
parameters,
forceBackend: backend
);
iterationStopwatch.Stop();
times.Add(iterationStopwatch.Elapsed);
}
totalStopwatch.Stop();
// Calculate statistics
var minTime = times.Min();
var maxTime = times.Max();
var avgTime = TimeSpan.FromTicks((long)times.Average(t => t.Ticks));
var variance = times.Average(t => Math.Pow((t - avgTime).Ticks, 2));
var stdDev = TimeSpan.FromTicks((long)Math.Sqrt(variance));
return new PerformanceProfile
{
KernelName = kernelName,
Backend = backend,
Iterations = iterations,
TotalTime = totalStopwatch.Elapsed,
MinTime = minTime,
MaxTime = maxTime,
StandardDeviation = stdDev,
MemoryAllocated = memoryAllocated,
MemoryTransferred = memoryTransferred
};
}
Determinism Testing
Determinism Result
public class DeterminismResult
{
/// <summary>
/// True if kernel is deterministic (all runs produce same output)
/// </summary>
public bool IsDeterministic { get; init; }
/// <summary>
/// Number of runs executed
/// </summary>
public int Runs { get; init; }
/// <summary>
/// Differences found between runs
/// </summary>
public IReadOnlyList<DeterminismViolation> Violations { get; init; }
/// <summary>
/// Likely cause of non-determinism
/// </summary>
public NonDeterminismCause? Cause { get; init; }
}
public class DeterminismViolation
{
public int RunIndex { get; init; }
public int ElementIndex { get; init; }
public object ExpectedValue { get; init; }
public object ActualValue { get; init; }
}
public enum NonDeterminismCause
{
RaceCondition, // Concurrent writes to same location
UnorderedReduction, // Non-associative reduction operation
FloatingPointError, // Accumulated FP rounding errors
RandomNumberGenerator, // Uses RNG without fixed seed
SystemClock, // Uses system time
Unknown
}
Testing Implementation
public async Task<DeterminismResult> TestDeterminismAsync(
string kernelName,
object parameters,
AcceleratorType backend,
int runs = 10,
CancellationToken cancellationToken = default)
{
var results = new List<object>();
// Execute multiple runs with same input
for (int i = 0; i < runs; i++)
{
var result = await _orchestrator.ExecuteKernelAsync(
kernelName,
parameters,
forceBackend: backend
);
results.Add(result);
}
// Compare all results against first run
var referenceResult = results[0];
var violations = new List<DeterminismViolation>();
for (int run = 1; run < runs; run++)
{
var differences = CompareResults(referenceResult, results[run], tolerance: 0);
foreach (var diff in differences)
{
violations.Add(new DeterminismViolation
{
RunIndex = run,
ElementIndex = diff.Index,
ExpectedValue = diff.ReferenceValue,
ActualValue = diff.PrimaryValue
});
}
}
// Analyze violations to determine cause
var cause = AnalyzeDeterminismViolations(violations);
return new DeterminismResult
{
IsDeterministic = !violations.Any(),
Runs = runs,
Violations = violations,
Cause = violations.Any() ? cause : null
};
}
private NonDeterminismCause AnalyzeDeterminismViolations(
List<DeterminismViolation> violations)
{
// Pattern analysis to determine likely cause
if (violations.Count > 0 && violations.All(v => v.ElementIndex % 32 == 0))
{
return NonDeterminismCause.RaceCondition; // Aligned accesses suggest race
}
if (violations.Count > 1000)
{
return NonDeterminismCause.FloatingPointError; // Many small errors
}
// More sophisticated analysis...
return NonDeterminismCause.Unknown;
}
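To see why UnorderedReduction is a common culprit: floating-point addition is not associative, so a parallel sum whose accumulation order varies between runs can legitimately differ in the last bits. The standalone snippet below (not part of the debug service) usually demonstrates this:

using System;
using System.Linq;

// Summing identical values in two different orders; the results often differ
// in the final bits, which a tolerance-zero determinism check will flag.
var values = Enumerable.Range(1, 1_000_000).Select(i => 1.0f / i).ToArray();

float forwardSum = 0f, reverseSum = 0f;
foreach (var v in values) forwardSum += v;
foreach (var v in values.Reverse()) reverseSum += v;

Console.WriteLine(forwardSum == reverseSum
    ? "Sums happened to match for this data"
    : $"Sums differ: {forwardSum:R} vs {reverseSum:R}");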
Memory Pattern Analysis
Memory Access Report
public class MemoryAccessReport
{
/// <summary>
/// Kernel name
/// </summary>
public string KernelName { get; init; }
/// <summary>
/// Total memory accesses
/// </summary>
public long TotalAccesses { get; init; }
/// <summary>
/// Sequential access percentage (0.0 to 1.0)
/// </summary>
public double SequentialAccessRate { get; init; }
/// <summary>
/// Random access percentage (0.0 to 1.0)
/// </summary>
public double RandomAccessRate { get; init; }
/// <summary>
/// Cache hit rate (estimated, 0.0 to 1.0)
/// </summary>
public double CacheHitRate { get; init; }
/// <summary>
/// Memory bandwidth utilization (0.0 to 1.0)
/// </summary>
public double BandwidthUtilization { get; init; }
/// <summary>
/// Bank conflicts detected (GPU specific)
/// </summary>
public int BankConflicts { get; init; }
/// <summary>
/// Optimization suggestions
/// </summary>
public IReadOnlyList<string> Suggestions { get; init; }
}
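A typical way to consume the report is to surface its suggestions when the measured access pattern looks poor. The kernel name and thresholds below are illustrative:

var report = await debugService.AnalyzeMemoryPatternsAsync(
    kernelName: "MatrixTranspose",              // illustrative kernel name
    parameters: new { matrix = inputMatrix },
    backend: AcceleratorType.CUDA
);

if (report.SequentialAccessRate < 0.5 || report.BankConflicts > 0)
{
    Console.WriteLine($"Sequential access: {report.SequentialAccessRate:P0}, " +
                      $"cache hit rate: {report.CacheHitRate:P0}, " +
                      $"bank conflicts: {report.BankConflicts}");

    foreach (var suggestion in report.Suggestions)
        Console.WriteLine($"  - {suggestion}");
}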
Service Registration
Configuration
// Minimal setup (disabled by default)
services.AddDotComputeRuntime();
// Enable production debugging
services.AddProductionDebugging(options =>
{
options.Profile = DebugProfile.Production;
options.EnableCrossBackendValidation = true;
options.ReferenceBackend = AcceleratorType.CPU;
options.ToleranceThreshold = 1e-5;
options.ThrowOnValidationFailure = false; // Log only
});
// Development debugging (extensive validation)
#if DEBUG
services.AddProductionDebugging(options =>
{
options.Profile = DebugProfile.Development;
options.ValidateAllExecutions = true;
options.EnablePerformanceProfiling = true;
options.EnableDeterminismTesting = true;
options.ThrowOnValidationFailure = true; // Fail fast
});
#endif
// Testing profile (CI/CD)
services.AddProductionDebugging(options =>
{
options.Profile = DebugProfile.Testing;
options.SamplingRate = 0.1; // Validate 10% of executions
options.ThrowOnValidationFailure = true;
});
Usage Examples
Example 1: Cross-Backend Validation
// Arrange
var debugService = serviceProvider.GetRequiredService<IKernelDebugService>();
// Act
var validation = await debugService.ValidateCrossBackendAsync(
kernelName: "MatrixMultiply",
parameters: new { matrixA, matrixB },
primaryBackend: AcceleratorType.CUDA,
referenceBackend: AcceleratorType.CPU
);
// Assert
if (!validation.IsValid)
{
Console.WriteLine($"Validation failed with {validation.Differences.Count} differences");
Console.WriteLine($"Severity: {validation.Severity}");
Console.WriteLine($"Recommendation: {validation.Recommendation}");
Console.WriteLine($"GPU execution: {validation.PrimaryExecutionTime.TotalMilliseconds}ms");
Console.WriteLine($"CPU execution: {validation.ReferenceExecutionTime.TotalMilliseconds}ms");
Console.WriteLine($"Speedup: {validation.Speedup:F2}x");
}
Example 2: Performance Profiling
var profile = await debugService.ProfileKernelAsync(
kernelName: "VectorAdd",
parameters: new { a = dataA, b = dataB },
backend: AcceleratorType.CUDA,
iterations: 1000
);
Console.WriteLine($"Average time: {profile.AverageTime.TotalMicroseconds:F2}μs");
Console.WriteLine($"Min/Max: {profile.MinTime.TotalMicroseconds:F2}μs / {profile.MaxTime.TotalMicroseconds:F2}μs");
Console.WriteLine($"Std dev: {profile.StandardDeviation.TotalMicroseconds:F2}μs");
Console.WriteLine($"Throughput: {profile.Throughput:F0} ops/sec");
Example 3: Determinism Testing
var determinism = await debugService.TestDeterminismAsync(
kernelName: "ParallelSum",
parameters: new { data = inputData },
backend: AcceleratorType.CUDA,
runs: 100
);
if (!determinism.IsDeterministic)
{
Console.WriteLine($"Kernel is non-deterministic!");
Console.WriteLine($"Found {determinism.Violations.Count} violations");
Console.WriteLine($"Likely cause: {determinism.Cause}");
}
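Example 4: Stress Testing
IInputGenerator implementations are not shown on this page; RandomVectorGenerator below is a hypothetical stand-in used only to illustrate the call shape.

var stress = await debugService.StressTestKernelAsync(
    kernelName: "VectorAdd",
    inputGenerator: new RandomVectorGenerator(length: 1_000_000, seed: 42), // hypothetical generator
    backend: AcceleratorType.CUDA,
    iterations: 1000
);
// Inspect the returned StressTestResult for failed iterations; its exact members are
// defined by the library and not reproduced on this page.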
Testing Strategy
Debug Service Tests
[Fact]
public async Task ValidateCrossBackend_IdenticalResults_ReturnsValid()
{
// Arrange
var debugService = CreateDebugService();
var input = Enumerable.Range(0, 1000).Select(i => (float)i).ToArray();
// Act
var validation = await debugService.ValidateCrossBackendAsync(
"VectorDouble",
new { input },
AcceleratorType.CUDA,
AcceleratorType.CPU
);
// Assert
Assert.True(validation.IsValid);
Assert.Empty(validation.Differences);
}
[Fact]
public async Task ProfileKernel_ReturnsAccurateStatistics()
{
// Arrange
var debugService = CreateDebugService();
// Act
var profile = await debugService.ProfileKernelAsync(
"SimpleKernel",
parameters: new { size = 1000 },
backend: AcceleratorType.CPU,
iterations: 100
);
// Assert
Assert.Equal(100, profile.Iterations);
Assert.True(profile.MinTime <= profile.AverageTime);
Assert.True(profile.AverageTime <= profile.MaxTime);
Assert.True(profile.Throughput > 0);
}