Table of Contents

Class LinearAlgebraKernelLibrary

Namespace
DotCompute.Algorithms
Assembly
DotCompute.Algorithms.dll

Comprehensive GPU kernel library for linear algebra operations. Provides optimized kernels for matrix operations, decompositions, and eigenvalue computations.

public static class LinearAlgebraKernelLibrary
Inheritance
LinearAlgebraKernelLibrary
Inherited Members

Fields

CUDACholeskyKernel

CUDA kernel for blocked Cholesky decomposition.

public const string CUDACholeskyKernel = "\r\nextern \"C\" __global__ void cholesky_decomposition_cuda(\r\n    float* A,               // Input matrix (modified to L)\r\n    const int n,            // Matrix dimension\r\n    const int block_size    // Block size for tiling\r\n) {\r\n    int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n    int i = blockIdx.y;     // Current row being processed\r\n\r\n    // Ensure positive definiteness\r\n    __shared__ float pivot_value;\r\n\r\n    if (threadIdx.x == 0 && blockIdx.x == 0) {\r\n        pivot_value = A[i * n + i];\r\n    }\r\n\r\n    __syncthreads();\r\n\r\n    if (pivot_value <= 0.0f) {\r\n        return; // Matrix not positive definite\r\n    }\r\n\r\n    // Process diagonal element\r\n    if (i == blockIdx.x && tid == i) {\r\n        float sum = 0.0f;\r\n        for (int k = 0; k < i; k++) {\r\n            float val = A[i * n + k];\r\n            sum += val * val;\r\n        }\r\n\r\n        float diagonal_val = A[i * n + i] - sum;\r\n        if (diagonal_val > 0.0f) {\r\n            A[i * n + i] = sqrtf(diagonal_val);\r\n        }\r\n    }\r\n\r\n    __syncthreads();\r\n\r\n    // Process elements below diagonal in current column\r\n    if (tid > i && tid < n) {\r\n        float sum = 0.0f;\r\n        for (int k = 0; k < i; k++) {\r\n            sum += A[tid * n + k] * A[i * n + k];\r\n        }\r\n\r\n        float diagonal = A[i * n + i];\r\n        if (fabsf(diagonal) > 1e-10f) {\r\n            A[tid * n + i] = (A[tid * n + i] - sum) / diagonal;\r\n        }\r\n\r\n        // Zero out upper triangular part\r\n        if (tid < i) {\r\n            A[i * n + tid] = 0.0f;\r\n        }\r\n    }\r\n}"

Field Value

string

CUDAGivensRotationKernel

The c u d a givens rotation kernel.

public const string CUDAGivensRotationKernel = "\r\nextern \"C\" __global__ void apply_givens_rotation_cuda(\r\n    float* A,               // Matrix to transform\r\n    float* Q,               // Accumulate rotations\r\n    const int n,            // Matrix dimension\r\n    const int i,            // First row/col index\r\n    const int j,            // Second row/col index\r\n    const float c,          // Cosine of rotation\r\n    const float s           // Sine of rotation\r\n) {\r\n    int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n\r\n    // Apply rotation to rows\r\n    if (tid < n) {\r\n        float a_i = A[i * n + tid];\r\n        float a_j = A[j * n + tid];\r\n\r\n        A[i * n + tid] = c * a_i - s * a_j;\r\n        A[j * n + tid] = s * a_i + c * a_j;\r\n    }\r\n\r\n    __syncthreads();\r\n\r\n    // Apply rotation to columns\r\n    if (tid < n) {\r\n        float a_i = A[tid * n + i];\r\n        float a_j = A[tid * n + j];\r\n\r\n        A[tid * n + i] = c * a_i - s * a_j;\r\n        A[tid * n + j] = s * a_i + c * a_j;\r\n\r\n        // Update Q matrix\r\n        float q_i = Q[tid * n + i];\r\n        float q_j = Q[tid * n + j];\r\n\r\n        Q[tid * n + i] = c * q_i - s * q_j;\r\n        Q[tid * n + j] = s * q_i + c * q_j;\r\n    }\r\n}"

Field Value

string

CUDAJacobiSVDKernel

CUDA kernel for Jacobi SVD rotation with double precision support.

public const string CUDAJacobiSVDKernel = "\r\nextern \"C\" __global__ void jacobi_svd_rotation_cuda(\r\n    float* A,                 // Matrix A (modified in-place)\r\n    float* U,                 // Left singular vectors\r\n    float* V,                 // Right singular vectors\r\n    const int n,              // Matrix dimension\r\n    const int i,              // Row index for rotation\r\n    const int j,              // Column index for rotation\r\n    float* convergence_flag   // Convergence indicator\r\n) {\r\n    int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n\r\n    if (tid >= n) return;\r\n\r\n    // Load matrix elements for 2x2 submatrix\r\n    float a_ii = A[i * n + i];\r\n    float a_ij = A[i * n + j];\r\n    float a_ji = A[j * n + i];\r\n    float a_jj = A[j * n + j];\r\n\r\n    // Check convergence criteria\r\n    float off_diag = fabsf(a_ij) + fabsf(a_ji);\r\n    if (off_diag < 1e-10f) {\r\n        if (tid == 0) *convergence_flag = 1.0f;\r\n        return;\r\n    }\r\n\r\n    // Compute Jacobi rotation parameters\r\n    float tau = (a_jj - a_ii) / (2.0f * (a_ij + a_ji));\r\n    float t = copysignf(1.0f, tau) / (fabsf(tau) + sqrtf(1.0f + tau * tau));\r\n    float c = rsqrtf(1.0f + t * t);  // 1/sqrt(1+t²)\r\n    float s = c * t;\r\n\r\n    __syncthreads();\r\n\r\n    // Apply Givens rotation to columns i and j\r\n    if (tid < n) {\r\n        float a_ti = A[tid * n + i];\r\n        float a_tj = A[tid * n + j];\r\n\r\n        A[tid * n + i] = c * a_ti - s * a_tj;\r\n        A[tid * n + j] = s * a_ti + c * a_tj;\r\n\r\n        // Update U matrix\r\n        float u_ti = U[tid * n + i];\r\n        float u_tj = U[tid * n + j];\r\n\r\n        U[tid * n + i] = c * u_ti - s * u_tj;\r\n        U[tid * n + j] = s * u_ti + c * u_tj;\r\n    }\r\n\r\n    __syncthreads();\r\n\r\n    // Apply Givens rotation to rows i and j\r\n    if (tid < n) {\r\n        float a_it = A[i * n + tid];\r\n        float a_jt = A[j * n + tid];\r\n\r\n        A[i * n + tid] = c * a_it - s * a_jt;\r\n        A[j * n + tid] = s * a_it + c * a_jt;\r\n\r\n        // Update V matrix\r\n        float v_it = V[i * n + tid];\r\n        float v_jt = V[j * n + tid];\r\n\r\n        V[i * n + tid] = c * v_it - s * v_jt;\r\n        V[j * n + tid] = s * v_it + c * v_jt;\r\n    }\r\n}"

Field Value

string

CUDAMatrixMultiplyTiledKernel

The c u d a matrix multiply tiled kernel.

public const string CUDAMatrixMultiplyTiledKernel = "\r\nextern \"C\" __global__ void matrix_multiply_tiled_cuda(\r\n    const float* A,     // Matrix A (M x K)\r\n    const float* B,     // Matrix B (K x N)\r\n    float* C,           // Result matrix C (M x N)\r\n    const int M,        // Rows of A and C\r\n    const int N,        // Columns of B and C\r\n    const int K         // Columns of A and rows of B\r\n) {\r\n    const int TILE_SIZE = 16;\r\n\r\n    // Shared memory for tiles\r\n    __shared__ float tile_A[16][16];\r\n    __shared__ float tile_B[16][16];\r\n\r\n    // Thread and block indices\r\n    int tx = threadIdx.x;\r\n    int ty = threadIdx.y;\r\n    int bx = blockIdx.x;\r\n    int by = blockIdx.y;\r\n\r\n    // Output coordinates\r\n    int row = by * TILE_SIZE + ty;\r\n    int col = bx * TILE_SIZE + tx;\r\n\r\n    float sum = 0.0f;\r\n\r\n    // Loop over tiles\r\n    int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;\r\n    for (int t = 0; t < num_tiles; t++) {\r\n        // Load tile of A\r\n        int a_row = row;\r\n        int a_col = t * TILE_SIZE + tx;\r\n        if (a_row < M && a_col < K) {\r\n            tile_A[ty][tx] = A[a_row * K + a_col];\r\n        } else {\r\n            tile_A[ty][tx] = 0.0f;\r\n        }\r\n\r\n        // Load tile of B\r\n        int b_row = t * TILE_SIZE + ty;\r\n        int b_col = col;\r\n        if (b_row < K && b_col < N) {\r\n            tile_B[ty][tx] = B[b_row * N + b_col];\r\n        } else {\r\n            tile_B[ty][tx] = 0.0f;\r\n        }\r\n\r\n        __syncthreads();\r\n\r\n        // Compute partial sum\r\n        #pragma unroll\r\n        for (int k = 0; k < TILE_SIZE; k++) {\r\n            sum += tile_A[ty][k] * tile_B[k][tx];\r\n        }\r\n\r\n        __syncthreads();\r\n    }\r\n\r\n    // Store result\r\n    if (row < M && col < N) {\r\n        C[row * N + col] = sum;\r\n    }\r\n}"

Field Value

string

CUDAVectorOperationsKernel

The c u d a vector operations kernel.

public const string CUDAVectorOperationsKernel = "\r\nextern \"C\" __global__ void vector_operations_cuda(\r\n    const float* a,          // Input vector a\r\n    const float* b,          // Input vector b\r\n    float* result,           // Output vector\r\n    const int n,             // Vector length\r\n    const int operation      // 0=add, 1=sub, 2=mul, 3=div\r\n) {\r\n    int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n    int stride = blockDim.x * gridDim.x;\r\n\r\n    // Process multiple elements per thread for better efficiency\r\n    for (int i = tid; i < n; i += stride) {\r\n        float val_a = a[i];\r\n        float val_b = b[i];\r\n\r\n        switch (operation) {\r\n            case 0: // Addition\r\n                result[i] = val_a + val_b;\r\n                break;\r\n            case 1: // Subtraction\r\n                result[i] = val_a - val_b;\r\n                break;\r\n            case 2: // Element-wise multiplication\r\n                result[i] = val_a * val_b;\r\n                break;\r\n            case 3: // Element-wise division\r\n                result[i] = (fabsf(val_b) > 1e-10f) ? val_a / val_b : 0.0f;\r\n                break;\r\n        }\r\n    }\r\n}"

Field Value

string

CUDAWarpReductionKernel

The c u d a warp reduction kernel.

public const string CUDAWarpReductionKernel = "\r\n__device__ __forceinline__ float warp_reduce_sum(float val) {\r\n    for (int offset = warpSize/2; offset > 0; offset /= 2) {\r\n        val += __shfl_down_sync(0xFFFFFFFF, val, offset);\r\n    }\r\n    return val;\r\n}\r\n\r\nextern \"C\" __global__ void warp_reduction_cuda(\r\n    const float* input,        // Input array\r\n    float* output,            // Output array\r\n    const int n               // Input size\r\n) {\r\n    int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n    int lane = threadIdx.x % warpSize;\r\n    int warp_id = threadIdx.x / warpSize;\r\n\r\n    __shared__ float warp_sums[32]; // Max 32 warps per block\r\n\r\n    // Load and sum\r\n    float sum = 0.0f;\r\n    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {\r\n        sum += input[i];\r\n    }\r\n\r\n    // Warp-level reduction\r\n    sum = warp_reduce_sum(sum);\r\n\r\n    // Store warp result\r\n    if (lane == 0) {\r\n        warp_sums[warp_id] = sum;\r\n    }\r\n\r\n    __syncthreads();\r\n\r\n    // Block-level reduction\r\n    if (warp_id == 0) {\r\n        sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ?\r\n              warp_sums[threadIdx.x] : 0.0f;\r\n        sum = warp_reduce_sum(sum);\r\n\r\n        if (threadIdx.x == 0) {\r\n            output[blockIdx.x] = sum;\r\n        }\r\n    }\r\n}"

Field Value

string

OpenCLHouseholderTransformKernel

The open c l householder transform kernel.

public const string OpenCLHouseholderTransformKernel = "\r\n__kernel void apply_householder_transform_optimized(\r\n    __global float* matrix,           // Input/output matrix\r\n    __global const float* v,          // Householder vector\r\n    const int m,                      // Matrix rows\r\n    const int n,                      // Matrix columns\r\n    const int v_len,                  // Householder vector length\r\n    const int start_row               // Starting row index\r\n) {\r\n    int col = get_global_id(0);\r\n    int lid = get_local_id(0);\r\n\r\n    if (col >= n) return;\r\n\r\n    // Shared memory for dot product computation\r\n    __local float shared_dot[256];\r\n    shared_dot[lid] = 0.0f;\r\n\r\n    // Compute dot product v^T * A[:, col] using work group collaboration\r\n    float local_dot = 0.0f;\r\n    for (int i = lid; i < v_len; i += get_local_size(0)) {\r\n        if (start_row + i < m) {\r\n            local_dot += v[i] * matrix[(start_row + i) * n + col];\r\n        }\r\n    }\r\n\r\n    shared_dot[lid] = local_dot;\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // Parallel reduction\r\n    for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) {\r\n        if (lid < offset) {\r\n            shared_dot[lid] += shared_dot[lid + offset];\r\n        }\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n    }\r\n\r\n    float total_dot = shared_dot[0] * 2.0f;\r\n\r\n    // Apply transformation: A[:, col] = A[:, col] - 2 * (v^T * A[:, col]) * v\r\n    for (int i = lid; i < v_len; i += get_local_size(0)) {\r\n        if (start_row + i < m) {\r\n            matrix[(start_row + i) * n + col] -= total_dot * v[i];\r\n        }\r\n    }\r\n}"

Field Value

string

OpenCLHouseholderVectorKernel

OpenCL kernel for computing Householder vector with parallel reduction.

public const string OpenCLHouseholderVectorKernel = "\r\n__kernel void compute_householder_vector_parallel(\r\n    __global const float* column,     // Input column vector\r\n    __global float* householder,      // Output Householder vector\r\n    __global float* norm_result,      // Output norm value\r\n    const int n,                      // Vector length\r\n    const int start_idx               // Starting index\r\n) {\r\n    int gid = get_global_id(0);\r\n    int lid = get_local_id(0);\r\n    int group_size = get_local_size(0);\r\n\r\n    // Shared memory for reduction\r\n    __local float shared_data[256];\r\n\r\n    // Initialize shared memory\r\n    shared_data[lid] = 0.0f;\r\n\r\n    // Load and square elements for norm computation\r\n    if (gid < n - start_idx) {\r\n        float val = column[start_idx + gid];\r\n        householder[gid] = val;\r\n        shared_data[lid] = val * val;\r\n    }\r\n\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // Parallel reduction for norm\r\n    for (int offset = group_size / 2; offset > 0; offset /= 2) {\r\n        if (lid < offset) {\r\n            shared_data[lid] += shared_data[lid + offset];\r\n        }\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n    }\r\n\r\n    // Compute norm and update first element\r\n    if (lid == 0) {\r\n        float norm = sqrt(shared_data[0]);\r\n        norm_result[get_group_id(0)] = norm;\r\n\r\n        if (gid == 0 && norm > 1e-10f) {\r\n            float sign = householder[0] >= 0.0f ? 1.0f : -1.0f;\r\n            householder[0] += sign * norm;\r\n        }\r\n    }\r\n\r\n    barrier(CLK_GLOBAL_MEM_FENCE);\r\n\r\n    // Second reduction for normalization\r\n    if (gid < n - start_idx) {\r\n        shared_data[lid] = householder[gid] * householder[gid];\r\n    } else {\r\n        shared_data[lid] = 0.0f;\r\n    }\r\n\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // Reduction for vector norm\r\n    for (int offset = group_size / 2; offset > 0; offset /= 2) {\r\n        if (lid < offset) {\r\n            shared_data[lid] += shared_data[lid + offset];\r\n        }\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n    }\r\n\r\n    // Normalize vector\r\n    if (lid == 0) {\r\n        float vnorm = sqrt(shared_data[0]);\r\n        if (vnorm > 1e-10f) {\r\n            for (int i = 0; i < n - start_idx; i++) {\r\n                householder[i] /= vnorm;\r\n            }\r\n        }\r\n    }\r\n}"

Field Value

string

OpenCLLUDecompositionKernel

OpenCL kernel for LU decomposition with partial pivoting.

public const string OpenCLLUDecompositionKernel = "\r\n__kernel void lu_decomposition_step(\r\n    __global float* A,          // Matrix being decomposed\r\n    __global int* P,            // Permutation array\r\n    const int n,                // Matrix dimension\r\n    const int k                 // Current step\r\n) {\r\n    int tid = get_global_id(0);\r\n    int row = tid + k + 1;\r\n\r\n    if (row >= n) return;\r\n\r\n    // Find pivot (would need separate kernel for full pivoting)\r\n    __local float max_val;\r\n    __local int pivot_row;\r\n\r\n    if (get_local_id(0) == 0) {\r\n        max_val = fabs(A[k * n + k]);\r\n        pivot_row = k;\r\n\r\n        for (int i = k + 1; i < n; i++) {\r\n            float val = fabs(A[i * n + k]);\r\n            if (val > max_val) {\r\n                max_val = val;\r\n                pivot_row = i;\r\n            }\r\n        }\r\n    }\r\n\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // Swap rows if needed (simplified - would need atomic operations)\r\n    if (pivot_row != k && get_local_id(0) == 0) {\r\n        for (int j = 0; j < n; j++) {\r\n            float temp = A[k * n + j];\r\n            A[k * n + j] = A[pivot_row * n + j];\r\n            A[pivot_row * n + j] = temp;\r\n        }\r\n\r\n        // Update permutation\r\n        int temp_p = P[k];\r\n        P[k] = P[pivot_row];\r\n        P[pivot_row] = temp_p;\r\n    }\r\n\r\n    barrier(CLK_GLOBAL_MEM_FENCE);\r\n\r\n    // Compute multipliers and eliminate\r\n    if (row < n) {\r\n        float pivot = A[k * n + k];\r\n        if (fabs(pivot) > 1e-10f) {\r\n            float multiplier = A[row * n + k] / pivot;\r\n            A[row * n + k] = multiplier; // Store L\r\n\r\n            // Eliminate\r\n            for (int j = k + 1; j < n; j++) {\r\n                A[row * n + j] -= multiplier * A[k * n + j];\r\n            }\r\n        }\r\n    }\r\n}"

Field Value

string

OpenCLMatrixMultiplyTiledKernel

OpenCL kernel for tiled matrix multiplication with shared memory optimization.

public const string OpenCLMatrixMultiplyTiledKernel = "\r\n// Optimized tiled matrix multiplication using shared memory\r\n__kernel void matrix_multiply_tiled(\r\n    __global const float* A,    // Matrix A (M x K)\r\n    __global const float* B,    // Matrix B (K x N)\r\n    __global float* C,          // Result matrix C (M x N)\r\n    const int M,                // Rows of A and C\r\n    const int N,                // Columns of B and C\r\n    const int K                 // Columns of A and rows of B\r\n) {\r\n    // Tile size (should match local work size)\r\n    const int TILE_SIZE = 16;\r\n\r\n    // Work group and work item IDs\r\n    int gid_x = get_group_id(0);\r\n    int gid_y = get_group_id(1);\r\n    int lid_x = get_local_id(0);\r\n    int lid_y = get_local_id(1);\r\n\r\n    // Shared memory for tiles\r\n    __local float tile_A[16][16];\r\n    __local float tile_B[16][16];\r\n\r\n    // Output coordinates\r\n    int row = gid_y * TILE_SIZE + lid_y;\r\n    int col = gid_x * TILE_SIZE + lid_x;\r\n\r\n    float sum = 0.0f;\r\n\r\n    // Loop over tiles\r\n    int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;\r\n    for (int t = 0; t < num_tiles; t++) {\r\n        // Load tile of A\r\n        int a_row = row;\r\n        int a_col = t * TILE_SIZE + lid_x;\r\n        if (a_row < M && a_col < K) {\r\n            tile_A[lid_y][lid_x] = A[a_row * K + a_col];\r\n        } else {\r\n            tile_A[lid_y][lid_x] = 0.0f;\r\n        }\r\n\r\n        // Load tile of B\r\n        int b_row = t * TILE_SIZE + lid_y;\r\n        int b_col = col;\r\n        if (b_row < K && b_col < N) {\r\n            tile_B[lid_y][lid_x] = B[b_row * N + b_col];\r\n        } else {\r\n            tile_B[lid_y][lid_x] = 0.0f;\r\n        }\r\n\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n        // Compute partial sum\r\n        for (int k = 0; k < TILE_SIZE; k++) {\r\n            sum += tile_A[lid_y][k] * tile_B[k][lid_x];\r\n        }\r\n\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n    }\r\n\r\n    // Store result\r\n    if (row < M && col < N) {\r\n        C[row * N + col] = sum;\r\n    }\r\n}"

Field Value

string

OpenCLMatrixVectorKernel

OpenCL kernel for optimized matrix-vector multiplication.

public const string OpenCLMatrixVectorKernel = "\r\n__kernel void matrix_vector_multiply_optimized(\r\n    __global const float* A,     // Matrix A (m x n)\r\n    __global const float* x,     // Vector x (n x 1)\r\n    __global float* y,           // Result vector y (m x 1)\r\n    const int m,                 // Matrix rows\r\n    const int n                  // Matrix columns\r\n) {\r\n    int row = get_global_id(0);\r\n    int lid = get_local_id(0);\r\n\r\n    if (row >= m) return;\r\n\r\n    // Use shared memory for vector caching\r\n    __local float x_cache[256];\r\n\r\n    float sum = 0.0f;\r\n\r\n    // Process vector in chunks that fit in shared memory\r\n    for (int chunk = 0; chunk < n; chunk += 256) {\r\n        // Load chunk of x into shared memory\r\n        if (lid < 256 && chunk + lid < n) {\r\n            x_cache[lid] = x[chunk + lid];\r\n        } else if (lid < 256) {\r\n            x_cache[lid] = 0.0f;\r\n        }\r\n\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n        // Compute partial dot product\r\n        int chunk_size = min(256, n - chunk);\r\n        for (int i = 0; i < chunk_size; i++) {\r\n            sum += A[row * n + chunk + i] * x_cache[i];\r\n        }\r\n\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n    }\r\n\r\n    y[row] = sum;\r\n}"

Field Value

string

OpenCLParallelReductionKernel

OpenCL kernel for parallel reduction with multiple operations.

public const string OpenCLParallelReductionKernel = "\r\n__kernel void parallel_reduction_optimized(\r\n    __global const float* input,    // Input array\r\n    __global float* output,         // Output (one per work group)\r\n    __local float* scratch,         // Local memory for reduction\r\n    const int n,                    // Input size\r\n    const int operation             // 0=sum, 1=max, 2=min, 3=norm2\r\n) {\r\n    int gid = get_global_id(0);\r\n    int lid = get_local_id(0);\r\n    int group_size = get_local_size(0);\r\n\r\n    // Initialize local memory\r\n    float value = 0.0f;\r\n\r\n    // Load data and handle boundary conditions\r\n    if (gid < n) {\r\n        value = input[gid];\r\n        if (operation == 3) { // L2 norm squared\r\n            value = value * value;\r\n        }\r\n    }\r\n\r\n    // Handle operations that need special initialization\r\n    if (operation == 2 && gid >= n) { // Min operation\r\n        value = FLT_MAX;\r\n    } else if (operation == 1 && gid >= n) { // Max operation\r\n        value = -FLT_MAX;\r\n    }\r\n\r\n    scratch[lid] = value;\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // Parallel reduction\r\n    for (int offset = group_size / 2; offset > 0; offset /= 2) {\r\n        if (lid < offset) {\r\n            float other = scratch[lid + offset];\r\n\r\n            switch (operation) {\r\n                case 0: // Sum\r\n                case 3: // L2 norm (sum of squares)\r\n                    scratch[lid] += other;\r\n                    break;\r\n                case 1: // Max\r\n                    scratch[lid] = fmax(scratch[lid], other);\r\n                    break;\r\n                case 2: // Min\r\n                    scratch[lid] = fmin(scratch[lid], other);\r\n                    break;\r\n            }\r\n        }\r\n        barrier(CLK_LOCAL_MEM_FENCE);\r\n    }\r\n\r\n    // Store result\r\n    if (lid == 0) {\r\n        if (operation == 3) {\r\n            output[get_group_id(0)] = sqrt(scratch[0]); // Final sqrt for L2 norm\r\n        } else {\r\n            output[get_group_id(0)] = scratch[0];\r\n        }\r\n    }\r\n}"

Field Value

string

OpenCLQRShiftKernel

OpenCL kernel for QR algorithm with Wilkinson shift.

public const string OpenCLQRShiftKernel = "\r\n__kernel void qr_algorithm_shift(\r\n    __global float* A,            // Hessenberg matrix (modified in-place)\r\n    __global float* Q,            // Accumulated transformations\r\n    __global float* eigenvalues,  // Output eigenvalues\r\n    const int n,                  // Matrix dimension\r\n    const float tolerance         // Convergence tolerance\r\n) {\r\n    int tid = get_global_id(0);\r\n\r\n    if (tid >= n * n) return;\r\n\r\n    // Compute Wilkinson shift from bottom-right 2x2 submatrix\r\n    __local float shift_value;\r\n\r\n    if (tid == 0) {\r\n        if (n >= 2) {\r\n            float a = A[(n-2) * n + (n-2)];\r\n            float b = A[(n-2) * n + (n-1)];\r\n            float c = A[(n-1) * n + (n-2)];\r\n            float d = A[(n-1) * n + (n-1)];\r\n\r\n            float trace = a + d;\r\n            float det = a * d - b * c;\r\n            float discriminant = trace * trace - 4.0f * det;\r\n\r\n            if (discriminant >= 0.0f) {\r\n                float sqrt_disc = sqrt(discriminant);\r\n                float lambda1 = (trace + sqrt_disc) * 0.5f;\r\n                float lambda2 = (trace - sqrt_disc) * 0.5f;\r\n\r\n                // Choose eigenvalue closer to d\r\n                shift_value = (fabs(lambda1 - d) < fabs(lambda2 - d)) ? lambda1 : lambda2;\r\n            } else {\r\n                shift_value = d;\r\n            }\r\n        } else {\r\n            shift_value = 0.0f;\r\n        }\r\n    }\r\n\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // Apply shift: A = A - shift * I\r\n    int row = tid / n;\r\n    int col = tid % n;\r\n\r\n    if (row == col && tid < n * n) {\r\n        A[tid] -= shift_value;\r\n    }\r\n\r\n    barrier(CLK_GLOBAL_MEM_FENCE);\r\n\r\n    // Apply Givens rotations for QR factorization of Hessenberg matrix\r\n    // (This would be called iteratively for each rotation)\r\n\r\n    // After QR step, restore shift: A = A + shift * I\r\n    if (row == col && tid < n * n) {\r\n        A[tid] += shift_value;\r\n    }\r\n\r\n    // Extract eigenvalues from diagonal\r\n    if (tid < n) {\r\n        eigenvalues[tid] = A[tid * n + tid];\r\n    }\r\n}"

Field Value

string

OpenCLSingularValuesKernel

The open c l singular values kernel.

public const string OpenCLSingularValuesKernel = "\r\n__kernel void extract_singular_values(\r\n    __global const float* A,     // Diagonalized matrix\r\n    __global float* S,           // Output singular values\r\n    __global float* U,           // Left singular vectors (to fix signs)\r\n    const int n                  // Matrix dimension\r\n) {\r\n    int i = get_global_id(0);\r\n\r\n    if (i >= n) return;\r\n\r\n    // Extract diagonal element and ensure positive\r\n    float value = A[i * n + i];\r\n    float abs_value = fabs(value);\r\n\r\n    S[i] = abs_value;\r\n\r\n    // If singular value was negative, flip corresponding column of U\r\n    if (value < 0.0f) {\r\n        for (int j = 0; j < n; j++) {\r\n            U[j * n + i] = -U[j * n + i];\r\n        }\r\n    }\r\n}"

Field Value

string

Methods

GetKernelSource(LinearAlgebraOperation, string, string)

Gets the appropriate kernel source for a given operation and accelerator type.

public static string GetKernelSource(LinearAlgebraKernelLibrary.LinearAlgebraOperation operation, string acceleratorType, string precision = "single")

Parameters

operation LinearAlgebraKernelLibrary.LinearAlgebraOperation

The linear algebra operation.

acceleratorType string

The target accelerator type.

precision string

Precision mode (single/double).

Returns

string

Kernel source code.