Table of Contents

Metal Backend Architecture

This document describes the technical architecture of the DotCompute Metal backend implementation.

Native Layer Architecture

File Structure

src/Backends/DotCompute.Backends.Metal/
├── native/
│   ├── include/
│   │   ├── DCMetalInterop.h     # Core type definitions
│   │   └── DCMetalMPS.h         # MPS function declarations
│   ├── src/
│   │   ├── DCMetalDevice.mm     # Device, buffers, kernels, command queues
│   │   └── DCMetalMPS.mm        # Metal Performance Shaders integration
│   ├── CMakeLists.txt           # Build configuration
│   └── build.sh                 # Build script
├── MPS/
│   ├── MetalMPSNative.cs        # P/Invoke declarations
│   ├── MetalMPSOrchestrator.cs  # Operation routing
│   └── MetalPerformanceShadersBackend.cs
├── Memory/
│   ├── MetalMemoryPoolManager.cs
│   └── MetalUnifiedBuffer.cs
├── MetalAccelerator.cs
├── MetalKernelCompiler.cs
└── MetalCommandQueuePool.cs

Native Interop Design

The native layer uses extern "C" functions for P/Invoke compatibility:

// DCMetalInterop.h
// Opaque handle types passed across the P/Invoke boundary. Each one is an
// untyped pointer, so the managed side never sees the object behind it.
typedef void* DCMetalDevice;
typedef void* DCMetalBuffer;
typedef void* DCMetalLibrary;
typedef void* DCMetalKernel;
typedef void* DCMetalCommandQueue;

// Function naming convention: DCMetal_<Action><Target>
// (as in DCMetal_CreateDevice and DCMetal_CreateBuffer below).
DCMetalDevice DCMetal_CreateDevice(void);
DCMetalBuffer DCMetal_CreateBuffer(DCMetalDevice device, size_t size, uint32_t options);
// NOTE(review): the "Native Error Propagation" section of this document shows
// DCMetal_ExecuteKernel returning DCMetalResult rather than bool - confirm
// which signature is current.
bool DCMetal_ExecuteKernel(DCMetalDevice device, DCMetalKernel kernel, ...);

Command Queue Pool

Problem

Metal's MTLCommandQueue is not thread-safe for concurrent command buffer creation. Using a single queue from multiple threads causes:

  • Race conditions
  • SIGSEGV crashes
  • Undefined behavior

Solution

Thread-safe command queue pool: a mutex-guarded map that lazily creates, and then reuses, one command queue per device:

// DCMetalDevice.mm
// Guards s_sharedCommandQueues; held for the whole lookup-or-create sequence.
static std::mutex s_commandQueueMutex;
// One cached command queue per device, keyed by the device's pointer value.
static std::map<void*, id<MTLCommandQueue>> s_sharedCommandQueues;

// Returns the shared command queue for `device`, creating it on first use.
// Returns nil when the device fails to create a queue; that failure is not
// cached, so a later call tries again.
id<MTLCommandQueue> getSharedCommandQueue(id<MTLDevice> device) {
    std::lock_guard<std::mutex> lock(s_commandQueueMutex);
    void* key = (__bridge void*)device;
    auto it = s_sharedCommandQueues.find(key);
    if (it != s_sharedCommandQueues.end()) {
        return it->second;
    }
    id<MTLCommandQueue> queue = [device newCommandQueue];
    // Cache only successful creations. A cached nil would be handed back by
    // every later call for this device instead of retrying the creation.
    if (queue) {
        s_sharedCommandQueues[key] = queue;
    }
    return queue;
}

Cleanup

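Explicit cleanup releases the cached queues at a well-defined point, instead of leaving them to static destructors that run at process exit, which prevents ARC conflicts during shutdown: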

// Empties the shared command-queue cache. The map is cleared while the cache
// mutex is held, and the whole operation runs inside its own autorelease pool.
void DCMetal_CleanupCommandQueues(void) {
    @autoreleasepool {
        const std::lock_guard<std::mutex> guard(s_commandQueueMutex);
        s_sharedCommandQueues.clear();
    }
}

MPS Integration Architecture

Thread-Safe MPS Command Queues

MPS operations have their own command queue pool:

// DCMetalMPS.mm
// Guards s_mpsCommandQueues for the whole lookup-or-create sequence.
static std::mutex s_mpsCommandQueueMutex;
// One cached MPS command queue per device, keyed by the device's pointer value.
static std::map<void*, id<MTLCommandQueue>> s_mpsCommandQueues;

// Returns the command queue used for MPS work on `device`, creating and
// caching it on first use. A failed creation returns nil and is not cached.
static id<MTLCommandQueue> getMPSCommandQueue(id<MTLDevice> device) {
    const std::lock_guard<std::mutex> guard(s_mpsCommandQueueMutex);
    void* const deviceKey = (__bridge void*)device;

    const auto cached = s_mpsCommandQueues.find(deviceKey);
    if (cached == s_mpsCommandQueues.end()) {
        // First request for this device: create the queue and remember it.
        id<MTLCommandQueue> created = [device newCommandQueue];
        if (created != nil) {
            s_mpsCommandQueues[deviceKey] = created;
        }
        return created;
    }
    return cached->second;
}

Static Data Safety

Avoid std::string in static scope to prevent exit-time destructor issues:

// BAD: std::string destructor conflicts with Objective-C runtime shutdown
// (a static std::string is destroyed by an exit-time destructor).
static std::string familyStr; // Causes SIGSEGV on exit

// GOOD: Plain char array has no destructor
// Fixed 64-byte buffer; it holds "Unknown" until something overwrites it.
static char s_gpuFamilyBuffer[64] = "Unknown";

MPS Operation Flow

┌─────────────────────┐
│   C# Application    │
└──────────┬──────────┘
           │
           ▼
┌─────────────────────┐
│  MPSOrchestrator    │──► ShouldUseMPS() decision
└──────────┬──────────┘
           │
     ┌─────┴─────┐
     │           │
     ▼           ▼
┌─────────┐ ┌─────────┐
│  MPS    │ │ Custom  │
│ Backend │ │ Kernel  │
└────┬────┘ └────┬────┘
     │           │
     ▼           ▼
┌─────────────────────┐
│  Native P/Invoke    │
└──────────┬──────────┘
           │
           ▼
┌─────────────────────┐
│  libDotComputeMetal │
│  (Objective-C++)    │
└──────────┬──────────┘
           │
           ▼
┌─────────────────────┐
│  Metal Framework    │
│  + MPS Framework    │
└─────────────────────┘

Memory Architecture

Unified Memory Model

Apple Silicon uses unified memory accessible by both CPU and GPU:

// Storage mode selection
// Unified memory -> Shared storage: direct CPU/GPU access.
// Otherwise      -> Private storage: GPU only, requires blit for CPU access.
MTLResourceOptions options = device.hasUnifiedMemory
    ? MTLResourceStorageModeShared
    : MTLResourceStorageModePrivate;

Memory Pool Design

/// <summary>
/// Recycles <see cref="MetalBuffer"/> instances through size-bucketed pools so
/// that repeated allocations of similar size can reuse an existing buffer.
/// </summary>
public class MetalMemoryPoolManager
{
    // Bucketed pools by size (powers of 2)
    private readonly ConcurrentDictionary<int, ConcurrentBag<MetalBuffer>> _pools;

    // Configuration
    private readonly int _maxPerBucket = 16;
    // NOTE(review): this limit is not enforced by the members shown here -
    // confirm where the total-bytes budget is applied.
    private readonly long _maxTotalBytes = 1024L * 1024 * 1024; // 1GB

    /// <summary>
    /// Returns a pooled buffer from the bucket that matches <paramref name="size"/>,
    /// or allocates a new buffer when that bucket is missing or empty.
    /// </summary>
    /// <param name="size">Requested buffer size.</param>
    public MetalBuffer Rent(int size)
    {
        int bucket = GetBucket(size);
        if (_pools.TryGetValue(bucket, out var pool) && pool.TryTake(out var buffer))
        {
            return buffer; // 90%+ hit rate
        }
        return AllocateNew(size);
    }

    /// <summary>
    /// Hands a buffer back to its bucket, or disposes it when the bucket
    /// already holds the maximum number of buffers.
    /// </summary>
    /// <param name="buffer">Buffer previously obtained from <see cref="Rent"/>.</param>
    public void Return(MetalBuffer buffer)
    {
        int bucket = GetBucket(buffer.Size);

        // The bucket may not exist yet: Rent only reads existing buckets, so the
        // first Return for a given size has to create it. Indexing _pools[bucket]
        // directly would throw KeyNotFoundException in that case.
        var pool = _pools.GetOrAdd(bucket, _ => new ConcurrentBag<MetalBuffer>());

        if (pool.Count < _maxPerBucket)
        {
            pool.Add(buffer);
        }
        else
        {
            buffer.Dispose(); // Pool full, release
        }
    }
}

Kernel Compilation Architecture

Binary Caching

// Compiles `source` into a Metal library, consulting the binary cache under
// `cacheKey` first.
//
// Parameters:
//   device     - device handle (bridged to id<MTLDevice>)
//   source     - NUL-terminated shader source
//   outLibrary - receives a retained library handle on success; the caller
//                owns that reference
//   cacheKey   - NUL-terminated key identifying this source in the cache
//
// Returns true and writes *outLibrary on success. Returns false, leaving
// *outLibrary untouched, when an argument is NULL or compilation fails.
bool DCMetal_CompileLibraryWithCache(
    DCMetalDevice device,
    const char* source,
    DCMetalLibrary* outLibrary,
    const char* cacheKey)
{
    // Reject NULL arguments before touching them: @(source) raises on a NULL
    // C string, the cache lookup must not be handed a NULL key, and outLibrary
    // is written through below.
    if (device == NULL || source == NULL || outLibrary == NULL || cacheKey == NULL) {
        return false;
    }

    @autoreleasepool {
        id<MTLDevice> mtlDevice = (__bridge id<MTLDevice>)device;

        // Check binary cache first
        // NOTE(review): s_binaryArchives is accessed without a lock here, unlike
        // the command-queue caches - confirm callers serialize compilation.
        auto it = s_binaryArchives.find(cacheKey);
        if (it != s_binaryArchives.end()) {
            // Load from cached binary archive (~5μs)
            // NOTE(review): newLibraryWithData:error: expects library data, while
            // the value stored below is an MTLBinaryArchive - confirm what the
            // cache really holds.
            NSError* error = nil;
            id<MTLLibrary> library = [mtlDevice newLibraryWithData:it->second
                                                            error:&error];
            if (library) {
                *outLibrary = (__bridge_retained void*)library;
                return true;
            }
            // Loading from the cache failed: fall through and recompile.
        }

        // Compile from source (~200ms)
        NSError* error = nil;
        id<MTLLibrary> library = [mtlDevice newLibraryWithSource:@(source)
                                                         options:nil
                                                           error:&error];
        if (library) {
            // Create and populate binary archive for future use
            MTLBinaryArchiveDescriptor* archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
            id<MTLBinaryArchive> archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc
                                                                               error:&error];
            // ... populate archive with compiled functions ...

            // Archive creation is best-effort. Never cache a nil archive, or every
            // later lookup for this key would hit an unusable entry.
            if (archive) {
                s_binaryArchives[cacheKey] = archive;
            }

            *outLibrary = (__bridge_retained void*)library;
            return true;
        }
        return false;
    }
}

Compilation Pipeline

Source Code (MSL)
       │
       ▼
┌─────────────────┐
│  Hash Source    │──► Cache Key
└────────┬────────┘
         │
    ┌────┴────┐
    │         │
    ▼         ▼
[Cache Hit]  [Cache Miss]
    │              │
    ▼              ▼
Load Binary   Compile Source
Archive       (~200ms)
(~5μs)             │
    │              ▼
    │         Create Binary
    │         Archive
    │              │
    └──────┬───────┘
           │
           ▼
    MTLLibrary Ready
           │
           ▼
    Extract Functions
    (MTLFunction)
           │
           ▼
    Create Pipeline State
    (MTLComputePipelineState)

Error Handling

Native Error Propagation

// Result record that native entry points return by value to report failures.
typedef struct {
    bool success;            // true on success, false when the call failed
    int errorCode;           // 0 on success; non-zero identifies the failure
    char errorMessage[256];  // error text as a C string; empty on success
} DCMetalResult;

// Runs a kernel and reports the outcome by value. Objective-C exceptions are
// caught here and turned into an error result, so they do not propagate to
// the caller.
//
// Returns {true, 0, ""} on success. If an exception is caught, success is
// false, errorCode is -1 and errorMessage holds the exception text, truncated
// to fit the buffer.
DCMetalResult DCMetal_ExecuteKernel(...) {
    DCMetalResult result = {true, 0, ""};

    @try {
        // ... execution code ...
    }
    @catch (NSException* exception) {
        result.success = false;
        result.errorCode = -1;

        // -reason is nullable, and messaging nil yields a NULL C string that
        // strncpy must never receive. Fall back to the exception name, then to
        // a fixed message.
        const char* message = [[exception reason] UTF8String];
        if (message == NULL) {
            message = [[exception name] UTF8String];
        }
        if (message == NULL) {
            message = "Unknown Objective-C exception";
        }

        strncpy(result.errorMessage, message, sizeof(result.errorMessage) - 1);
        // Terminate explicitly so the result stays valid even if the
        // initializer above ever changes.
        result.errorMessage[sizeof(result.errorMessage) - 1] = '\0';
    }

    return result;
}

C# Exception Handling

/// <summary>
/// Runs the kernel through the native layer and turns a failed native result
/// into a <c>MetalExecutionException</c>.
/// </summary>
public void Execute(...)
{
    var outcome = MetalNative.ExecuteKernel(...);
    if (outcome.success)
    {
        return;
    }

    // Surface the native error code and message to managed callers.
    var code = outcome.errorCode;
    var message = Marshal.PtrToStringUTF8(outcome.errorMessage);
    throw new MetalExecutionException(code, message);
}

Build System

Native Library Build

#!/bin/bash
# build.sh
# Configures and builds the native library, then copies it to the parent directory.

# Stop at the first failing command (and on unset variables) so that a failed
# configure or build never ends with a stale dylib being copied.
set -euo pipefail

# Configure CMake
cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=10.13

# Build
cmake --build build --config Release

# Install to parent directory
cp build/libDotComputeMetal.dylib ../

CMake Configuration

# CMakeLists.txt
# Builds the native Metal interop library as a shared library from the
# Objective-C++ sources.
cmake_minimum_required(VERSION 3.20)
project(DotComputeMetal VERSION 1.0.0 LANGUAGES CXX OBJCXX)

find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(MPS_FRAMEWORK MetalPerformanceShaders REQUIRED)
find_library(FOUNDATION_FRAMEWORK Foundation REQUIRED)

add_library(DotComputeMetal SHARED
    src/DCMetalDevice.mm
    src/DCMetalMPS.mm
)

# The Objective-C++ sources rely on ARC to release the cached Metal objects,
# and CMake does not enable ARC by default.
target_compile_options(DotComputeMetal PRIVATE
    $<$<COMPILE_LANGUAGE:OBJCXX>:-fobjc-arc>
)

# PRIVATE: the frameworks are implementation details of this library.
target_link_libraries(DotComputeMetal PRIVATE
    ${METAL_FRAMEWORK}
    ${MPS_FRAMEWORK}
    ${FOUNDATION_FRAMEWORK}
)

# *_STANDARD_REQUIRED makes configuration fail instead of silently falling back
# to an older language standard.
set_target_properties(DotComputeMetal PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    OBJCXX_STANDARD 17
    OBJCXX_STANDARD_REQUIRED ON
)

Testing Architecture

Unit Test Categories

/// <summary>Tests for creating a Metal accelerator.</summary>
[Category("Metal")]
public class MetalAcceleratorTests
{
    /// <summary>
    /// Creating an accelerator yields a non-null instance. The test is skipped
    /// on platforms other than macOS.
    /// </summary>
    [SkippableFact]
    public void CreateAccelerator_ReturnsValidDevice()
    {
        bool runningOnMacOS = RuntimeInformation.IsOSPlatform(OSPlatform.OSX);
        Skip.IfNot(runningOnMacOS);

        using (var device = MetalAccelerator.Create())
        {
            Assert.NotNull(device);
        }
    }
}

// Tests for the MPS backend, tagged with the "Metal-MPS" category.
[Category("Metal-MPS")]
public class MPSBackendTests
{
    // Intended to check that an MPS matrix multiplication produces the
    // expected result.
    [SkippableFact]
    public void MatrixMultiply_ProducesCorrectResult()
    {
        // ... (test body omitted in this document)
    }
}

Performance Validation

/// <summary>
/// Runs the performance validations one after another and collects their results.
/// </summary>
public class SimplePerformanceValidation
{
    /// <summary>
    /// Awaits each validation in sequence.
    /// </summary>
    /// <returns>
    /// One <see cref="ValidationResult"/> per validation, in execution order.
    /// The declared return type is a list because the method returns the whole
    /// collection; a single <see cref="ValidationResult"/> would not compile.
    /// </returns>
    public async Task<List<ValidationResult>> RunAll()
    {
        var results = new List<ValidationResult>();

        // Claim #1: Unified Memory (2-3x speedup)
        results.Add(await ValidateUnifiedMemory());

        // Claim #2: MPS Performance (3-4x for large matrices)
        results.Add(await ValidateMPSPerformance());

        // NOTE(review): there is no validation for Claim #3 - confirm whether
        // that gap is intentional.

        // Claim #4: Cold Start (<10ms)
        results.Add(await ValidateColdStart());

        // Claim #5: Kernel Cache (<1ms hits)
        results.Add(await ValidateKernelCache());

        // Claim #6: Command Buffer (<100μs)
        results.Add(await ValidateCommandBuffer());

        // Claim #7: Parallel Execution (>1.5x speedup)
        results.Add(await ValidateParallelExecution());

        return results;
    }
}

Debugging

Enable Native Logging

// Logging is on by default. The #ifndef guard lets a build pass
// -DMETAL_DEBUG=0 to switch it off without editing this file or triggering a
// macro-redefinition warning.
#ifndef METAL_DEBUG
    #define METAL_DEBUG 1
#endif

// METAL_LOG(fmt, ...) forwards to NSLog with a "[METAL-DEBUG] " prefix when
// METAL_DEBUG is non-zero, and expands to nothing otherwise. `fmt` must be a
// plain C string literal, because it is concatenated onto the prefix.
#if METAL_DEBUG
    #define METAL_LOG(fmt, ...) NSLog(@"[METAL-DEBUG] " fmt, ##__VA_ARGS__)
#else
    #define METAL_LOG(fmt, ...)
#endif

// Usage in code
METAL_LOG("Setting pipeline state: %p", pipelineState);
METAL_LOG("Dispatching: grid=(%lu,%lu,%lu)", gridX, gridY, gridZ);

GPU Validation

# Enable Metal validation layer
# Both variables are exported, so they apply to ./run-tests.sh and stay set for
# anything started later from the same shell.
export METAL_DEVICE_WRAPPER_TYPE=1
# NOTE(review): presumably controls how validation errors are reported -
# confirm the meaning of 0 against Apple's Metal validation documentation.
export METAL_DEBUG_ERROR_MODE=0
./run-tests.sh

Version Compatibility

| macOS Version | Metal Family       | Features           |
|---------------|--------------------|--------------------|
| 10.13+        | MTLGPUFamilyMac1   | Basic compute      |
| 10.15+        | MTLGPUFamilyMac2   | Binary archives    |
| 11.0+         | MTLGPUFamilyApple7 | Full Apple Silicon |
| 12.0+         | MTLGPUFamilyApple8 | M2+ features       |
| 14.0+         | MTLGPUFamilyApple9 | M3+ features       |

See Also