Performance Profiling
This module covers GPU-native performance measurement using timing APIs, profiling tools, and optimization strategies.
Why GPU-Native Timing?
CPU-based timing includes:
- Kernel launch overhead
- Host-device synchronization
- PCIe latency
GPU-native timing provides:
- Sub-microsecond resolution
- Accurate kernel-only measurement
- Hardware timestamp counters
GPU Timing API
Basic Usage
// Create timing provider
var timingProvider = orchestrator.GetTimingProvider();
// Calibrate for accuracy
await timingProvider.CalibrateAsync();
// Get GPU timestamp
var timestamp = await timingProvider.GetGpuTimestampAsync();
Console.WriteLine($"GPU timestamp: {timestamp} ns");
// Get clock frequency
var frequency = timingProvider.GetGpuClockFrequency();
Console.WriteLine($"Clock frequency: {frequency / 1e6:F0} MHz");
// Get timer resolution
var resolution = timingProvider.GetTimerResolutionNanos();
Console.WriteLine($"Resolution: {resolution} ns");
Measuring Kernel Execution
public async Task<double> MeasureKernelExecutionAsync(Delegate kernel, KernelConfig config, params object[] args)
{
var timingProvider = orchestrator.GetTimingProvider();
// Record start timestamp
var startTimestamp = await timingProvider.GetGpuTimestampAsync();
// Execute kernel
await orchestrator.ExecuteKernelAsync(kernel, config, args);
// Synchronize to ensure kernel completed
await orchestrator.SynchronizeAsync();
// Record end timestamp
var endTimestamp = await timingProvider.GetGpuTimestampAsync();
// Calculate elapsed time in milliseconds
double elapsedNs = endTimestamp - startTimestamp;
return elapsedNs / 1_000_000.0;
}
Batch Timestamp Collection
// For high-frequency measurements
var timestamps = await timingProvider.GetGpuTimestampsBatchAsync(
count: 100,
interval: TimeSpan.FromMicroseconds(10));
// Analyze timing distribution
var deltas = new List<double>();
for (int i = 1; i < timestamps.Length; i++)
{
deltas.Add((timestamps[i] - timestamps[i-1]) / 1000.0); // microseconds
}
Console.WriteLine($"Average interval: {deltas.Average():F2} µs");
Console.WriteLine($"Std deviation: {CalculateStdDev(deltas):F2} µs");
Calibration Strategies
Available Strategies
// Quick calibration (default)
await timingProvider.CalibrateAsync(CalibrationStrategy.Quick);
// Accurate calibration (takes longer)
await timingProvider.CalibrateAsync(CalibrationStrategy.Accurate);
// Statistical calibration (most accurate)
await timingProvider.CalibrateAsync(CalibrationStrategy.Statistical);
// No calibration (raw timestamps)
await timingProvider.CalibrateAsync(CalibrationStrategy.None);
Calibration Parameters
var calibrationOptions = new CalibrationOptions
{
Strategy = CalibrationStrategy.Accurate,
Iterations = 1000,
WarmupIterations = 100,
OutlierRejection = true,
OutlierThreshold = 2.0 // Standard deviations
};
await timingProvider.CalibrateAsync(calibrationOptions);
Profiling Patterns
Pattern 1: Comprehensive Kernel Profile
public class KernelProfile
{
public double ExecutionTimeMs { get; set; }
public double MemoryThroughputGBps { get; set; }
public double ComputeThroughputGFlops { get; set; }
public double Occupancy { get; set; }
public long RegistersPerThread { get; set; }
public long SharedMemoryBytes { get; set; }
}
public async Task<KernelProfile> ProfileKernelAsync(
Delegate kernel,
KernelConfig config,
int dataSize,
int flopsPerElement)
{
var timingProvider = orchestrator.GetTimingProvider();
// Warmup
for (int i = 0; i < 10; i++)
{
await orchestrator.ExecuteKernelAsync(kernel, config, buffer);
}
await orchestrator.SynchronizeAsync();
// Measure multiple iterations
const int iterations = 100;
var times = new double[iterations];
for (int i = 0; i < iterations; i++)
{
var start = await timingProvider.GetGpuTimestampAsync();
await orchestrator.ExecuteKernelAsync(kernel, config, buffer);
await orchestrator.SynchronizeAsync();
var end = await timingProvider.GetGpuTimestampAsync();
times[i] = (end - start) / 1_000_000.0; // ms
}
// Calculate metrics
double avgTime = times.Average();
double dataBytes = dataSize * sizeof(float) * 2; // read + write
double memBandwidth = dataBytes / (avgTime / 1000.0) / 1e9; // GB/s
double totalFlops = (double)dataSize * flopsPerElement;
double computeThroughput = totalFlops / (avgTime / 1000.0) / 1e9; // GFLOPS
// Get occupancy info
var occupancy = await orchestrator.GetOccupancyAsync(kernel, config.BlockSize);
return new KernelProfile
{
ExecutionTimeMs = avgTime,
MemoryThroughputGBps = memBandwidth,
ComputeThroughputGFlops = computeThroughput,
Occupancy = occupancy.TheoreticalOccupancy,
RegistersPerThread = occupancy.RegistersPerThread,
SharedMemoryBytes = occupancy.SharedMemoryPerBlock
};
}
Pattern 2: Pipeline Stage Timing
public class PipelineProfile
{
public Dictionary<string, double> StageTimes { get; set; } = new();
public double TotalTimeMs { get; set; }
}
public async Task<PipelineProfile> ProfilePipelineAsync()
{
var profile = new PipelineProfile();
var timingProvider = orchestrator.GetTimingProvider();
var totalStart = await timingProvider.GetGpuTimestampAsync();
// Stage 1: Preprocessing
var stage1Start = await timingProvider.GetGpuTimestampAsync();
await orchestrator.ExecuteKernelAsync(preprocessKernel, config, buffer1, buffer2);
await orchestrator.SynchronizeAsync();
var stage1End = await timingProvider.GetGpuTimestampAsync();
profile.StageTimes["Preprocess"] = (stage1End - stage1Start) / 1_000_000.0;
// Stage 2: Main processing
var stage2Start = await timingProvider.GetGpuTimestampAsync();
await orchestrator.ExecuteKernelAsync(mainKernel, config, buffer2, buffer3);
await orchestrator.SynchronizeAsync();
var stage2End = await timingProvider.GetGpuTimestampAsync();
profile.StageTimes["Main"] = (stage2End - stage2Start) / 1_000_000.0;
// Stage 3: Postprocessing
var stage3Start = await timingProvider.GetGpuTimestampAsync();
await orchestrator.ExecuteKernelAsync(postprocessKernel, config, buffer3, buffer4);
await orchestrator.SynchronizeAsync();
var stage3End = await timingProvider.GetGpuTimestampAsync();
profile.StageTimes["Postprocess"] = (stage3End - stage3Start) / 1_000_000.0;
var totalEnd = await timingProvider.GetGpuTimestampAsync();
profile.TotalTimeMs = (totalEnd - totalStart) / 1_000_000.0;
return profile;
}
Pattern 3: Memory Transfer Profiling
public async Task ProfileTransfersAsync(int size)
{
var timingProvider = orchestrator.GetTimingProvider();
var hostData = new float[size];
using var buffer = orchestrator.CreateBuffer<float>(size);
// Profile Host → GPU
var h2dStart = await timingProvider.GetGpuTimestampAsync();
await buffer.CopyFromAsync(hostData);
await orchestrator.SynchronizeAsync();
var h2dEnd = await timingProvider.GetGpuTimestampAsync();
double h2dTimeMs = (h2dEnd - h2dStart) / 1_000_000.0;
double h2dBandwidth = (size * sizeof(float)) / (h2dTimeMs / 1000.0) / 1e9;
Console.WriteLine($"Host→GPU: {h2dTimeMs:F3} ms, {h2dBandwidth:F1} GB/s");
// Profile GPU → Host
var d2hStart = await timingProvider.GetGpuTimestampAsync();
await buffer.CopyToAsync(hostData);
await orchestrator.SynchronizeAsync();
var d2hEnd = await timingProvider.GetGpuTimestampAsync();
double d2hTimeMs = (d2hEnd - d2hStart) / 1_000_000.0;
double d2hBandwidth = (size * sizeof(float)) / (d2hTimeMs / 1000.0) / 1e9;
Console.WriteLine($"GPU→Host: {d2hTimeMs:F3} ms, {d2hBandwidth:F1} GB/s");
}
Bottleneck Analysis
Identifying Bottlenecks
public enum BottleneckType
{
Compute,
MemoryBandwidth,
MemoryLatency,
LaunchOverhead,
Synchronization
}
public async Task<BottleneckType> AnalyzeBottleneckAsync(KernelProfile profile, HardwareInfo hw)
{
// Calculate theoretical peaks
double peakMemBw = hw.MemoryBandwidthGBps;
double peakCompute = hw.PeakGFlops;
// Calculate utilization
double memUtilization = profile.MemoryThroughputGBps / peakMemBw;
double computeUtilization = profile.ComputeThroughputGFlops / peakCompute;
Console.WriteLine($"Memory utilization: {memUtilization:P0}");
Console.WriteLine($"Compute utilization: {computeUtilization:P0}");
if (memUtilization > 0.7 && computeUtilization < 0.3)
{
return BottleneckType.MemoryBandwidth;
}
else if (computeUtilization > 0.7 && memUtilization < 0.3)
{
return BottleneckType.Compute;
}
else if (profile.Occupancy < 0.3)
{
return BottleneckType.MemoryLatency;
}
else if (profile.ExecutionTimeMs < 0.1)
{
return BottleneckType.LaunchOverhead;
}
else
{
return BottleneckType.Synchronization;
}
}
Optimization Recommendations
public string GetOptimizationRecommendation(BottleneckType bottleneck)
{
return bottleneck switch
{
BottleneckType.MemoryBandwidth =>
"• Improve memory coalescing\n" +
"• Use shared memory for reused data\n" +
"• Reduce memory transactions per thread",
BottleneckType.Compute =>
"• Increase arithmetic intensity\n" +
"• Use faster math functions (e.g., __sinf vs sin)\n" +
"• Consider mixed precision",
BottleneckType.MemoryLatency =>
"• Increase occupancy (more warps)\n" +
"• Prefetch data to hide latency\n" +
"• Reduce register usage",
BottleneckType.LaunchOverhead =>
"• Batch multiple operations\n" +
"• Use persistent kernels (Ring Kernels)\n" +
"• Increase work per kernel launch",
BottleneckType.Synchronization =>
"• Reduce barrier frequency\n" +
"• Use warp-level primitives\n" +
"• Overlap compute and communication",
_ => "Profile more to identify specific issues"
};
}
BenchmarkDotNet Integration
[SimpleJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class GpuBenchmarks
{
private IComputeOrchestrator _orchestrator = null!;
private IBuffer<float> _buffer = null!;
[Params(1024, 1024*1024, 10*1024*1024)]
public int Size { get; set; }
[GlobalSetup]
public void Setup()
{
var services = new ServiceCollection();
services.AddDotComputeRuntime();
var provider = services.BuildServiceProvider();
_orchestrator = provider.GetRequiredService<IComputeOrchestrator>();
_buffer = _orchestrator.CreateBuffer<float>(Size);
}
[GlobalCleanup]
public void Cleanup()
{
_buffer.Dispose();
}
[Benchmark(Baseline = true)]
public async Task VectorAddBaseline()
{
await _orchestrator.ExecuteKernelAsync(
MyKernels.VectorAdd,
new KernelConfig { BlockSize = 256, GridSize = (Size + 255) / 256 },
_buffer, _buffer, _buffer);
await _orchestrator.SynchronizeAsync();
}
[Benchmark]
public async Task VectorAddOptimized()
{
await _orchestrator.ExecuteKernelAsync(
MyKernels.VectorAddOptimized,
new KernelConfig { BlockSize = 256, GridSize = (Size + 255) / 256 },
_buffer, _buffer, _buffer);
await _orchestrator.SynchronizeAsync();
}
}
Continuous Performance Monitoring
public class PerformanceMonitor
{
private readonly ConcurrentQueue<PerformanceSample> _samples = new();
private readonly ITimingProvider _timingProvider;
public void RecordSample(string operation, double durationMs, long dataSize)
{
_samples.Enqueue(new PerformanceSample
{
Timestamp = DateTime.UtcNow,
Operation = operation,
DurationMs = durationMs,
DataSize = dataSize,
ThroughputGBps = dataSize / (durationMs / 1000.0) / 1e9
});
// Detect performance regression
var recentSamples = GetRecentSamples(operation, TimeSpan.FromMinutes(5));
var avgThroughput = recentSamples.Average(s => s.ThroughputGBps);
var baseline = GetBaseline(operation);
if (avgThroughput < baseline * 0.8)
{
_logger.LogWarning(
"Performance regression detected in {Operation}: " +
"{Current:F1} GB/s vs baseline {Baseline:F1} GB/s",
operation, avgThroughput, baseline);
}
}
}
Exercises
Exercise 1: Kernel Profile
Profile a kernel and identify whether it's compute or memory bound.
Exercise 2: Pipeline Analysis
Profile a multi-stage pipeline and find the slowest stage.
Exercise 3: Optimization Benchmark
Optimize a kernel based on profiling data and verify improvement with benchmarks.
Key Takeaways
- GPU-native timing provides accurate kernel-only measurements
- Calibration improves accuracy - use appropriate strategy for your needs
- Multiple iterations reduce measurement noise
- Bottleneck analysis guides optimization efforts
- Continuous monitoring catches performance regressions
Path Complete
Congratulations! You've completed the Advanced Learning Path.
What you learned:
- Ring Kernel persistent GPU computation
- Barrier and synchronization patterns
- Multi-GPU programming with P2P
- GPU-native performance profiling
Next steps:
- Contributor Path - Extend DotCompute
- Ring Kernels Guide - Complete reference
- Performance Guide - Deep dive
Further Reading
- GPU Timing API - Complete API reference
- Performance Profiling - Advanced techniques
- Benchmarking - Benchmark methodology