Class LinearAlgebraKernelLibrary
- Namespace
- DotCompute.Algorithms
- Assembly
- DotCompute.Algorithms.dll
Comprehensive GPU kernel library for linear algebra operations. Provides optimized kernels for matrix operations, decompositions, and eigenvalue computations.
public static class LinearAlgebraKernelLibrary
- Inheritance
-
LinearAlgebraKernelLibrary
- Inherited Members
Fields
CUDACholeskyKernel
CUDA kernel for blocked Cholesky decomposition.
public const string CUDACholeskyKernel = "\r\nextern \"C\" __global__ void cholesky_decomposition_cuda(\r\n float* A, // Input matrix (modified to L)\r\n const int n, // Matrix dimension\r\n const int block_size // Block size for tiling\r\n) {\r\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n int i = blockIdx.y; // Current row being processed\r\n\r\n // Ensure positive definiteness\r\n __shared__ float pivot_value;\r\n\r\n if (threadIdx.x == 0 && blockIdx.x == 0) {\r\n pivot_value = A[i * n + i];\r\n }\r\n\r\n __syncthreads();\r\n\r\n if (pivot_value <= 0.0f) {\r\n return; // Matrix not positive definite\r\n }\r\n\r\n // Process diagonal element\r\n if (i == blockIdx.x && tid == i) {\r\n float sum = 0.0f;\r\n for (int k = 0; k < i; k++) {\r\n float val = A[i * n + k];\r\n sum += val * val;\r\n }\r\n\r\n float diagonal_val = A[i * n + i] - sum;\r\n if (diagonal_val > 0.0f) {\r\n A[i * n + i] = sqrtf(diagonal_val);\r\n }\r\n }\r\n\r\n __syncthreads();\r\n\r\n // Process elements below diagonal in current column\r\n if (tid > i && tid < n) {\r\n float sum = 0.0f;\r\n for (int k = 0; k < i; k++) {\r\n sum += A[tid * n + k] * A[i * n + k];\r\n }\r\n\r\n float diagonal = A[i * n + i];\r\n if (fabsf(diagonal) > 1e-10f) {\r\n A[tid * n + i] = (A[tid * n + i] - sum) / diagonal;\r\n }\r\n\r\n // Zero out upper triangular part\r\n if (tid < i) {\r\n A[i * n + tid] = 0.0f;\r\n }\r\n }\r\n}"
Field Value
CUDAGivensRotationKernel
The c u d a givens rotation kernel.
public const string CUDAGivensRotationKernel = "\r\nextern \"C\" __global__ void apply_givens_rotation_cuda(\r\n float* A, // Matrix to transform\r\n float* Q, // Accumulate rotations\r\n const int n, // Matrix dimension\r\n const int i, // First row/col index\r\n const int j, // Second row/col index\r\n const float c, // Cosine of rotation\r\n const float s // Sine of rotation\r\n) {\r\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n\r\n // Apply rotation to rows\r\n if (tid < n) {\r\n float a_i = A[i * n + tid];\r\n float a_j = A[j * n + tid];\r\n\r\n A[i * n + tid] = c * a_i - s * a_j;\r\n A[j * n + tid] = s * a_i + c * a_j;\r\n }\r\n\r\n __syncthreads();\r\n\r\n // Apply rotation to columns\r\n if (tid < n) {\r\n float a_i = A[tid * n + i];\r\n float a_j = A[tid * n + j];\r\n\r\n A[tid * n + i] = c * a_i - s * a_j;\r\n A[tid * n + j] = s * a_i + c * a_j;\r\n\r\n // Update Q matrix\r\n float q_i = Q[tid * n + i];\r\n float q_j = Q[tid * n + j];\r\n\r\n Q[tid * n + i] = c * q_i - s * q_j;\r\n Q[tid * n + j] = s * q_i + c * q_j;\r\n }\r\n}"
Field Value
CUDAJacobiSVDKernel
CUDA kernel for Jacobi SVD rotation with double precision support.
public const string CUDAJacobiSVDKernel = "\r\nextern \"C\" __global__ void jacobi_svd_rotation_cuda(\r\n float* A, // Matrix A (modified in-place)\r\n float* U, // Left singular vectors\r\n float* V, // Right singular vectors\r\n const int n, // Matrix dimension\r\n const int i, // Row index for rotation\r\n const int j, // Column index for rotation\r\n float* convergence_flag // Convergence indicator\r\n) {\r\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n\r\n if (tid >= n) return;\r\n\r\n // Load matrix elements for 2x2 submatrix\r\n float a_ii = A[i * n + i];\r\n float a_ij = A[i * n + j];\r\n float a_ji = A[j * n + i];\r\n float a_jj = A[j * n + j];\r\n\r\n // Check convergence criteria\r\n float off_diag = fabsf(a_ij) + fabsf(a_ji);\r\n if (off_diag < 1e-10f) {\r\n if (tid == 0) *convergence_flag = 1.0f;\r\n return;\r\n }\r\n\r\n // Compute Jacobi rotation parameters\r\n float tau = (a_jj - a_ii) / (2.0f * (a_ij + a_ji));\r\n float t = copysignf(1.0f, tau) / (fabsf(tau) + sqrtf(1.0f + tau * tau));\r\n float c = rsqrtf(1.0f + t * t); // 1/sqrt(1+t²)\r\n float s = c * t;\r\n\r\n __syncthreads();\r\n\r\n // Apply Givens rotation to columns i and j\r\n if (tid < n) {\r\n float a_ti = A[tid * n + i];\r\n float a_tj = A[tid * n + j];\r\n\r\n A[tid * n + i] = c * a_ti - s * a_tj;\r\n A[tid * n + j] = s * a_ti + c * a_tj;\r\n\r\n // Update U matrix\r\n float u_ti = U[tid * n + i];\r\n float u_tj = U[tid * n + j];\r\n\r\n U[tid * n + i] = c * u_ti - s * u_tj;\r\n U[tid * n + j] = s * u_ti + c * u_tj;\r\n }\r\n\r\n __syncthreads();\r\n\r\n // Apply Givens rotation to rows i and j\r\n if (tid < n) {\r\n float a_it = A[i * n + tid];\r\n float a_jt = A[j * n + tid];\r\n\r\n A[i * n + tid] = c * a_it - s * a_jt;\r\n A[j * n + tid] = s * a_it + c * a_jt;\r\n\r\n // Update V matrix\r\n float v_it = V[i * n + tid];\r\n float v_jt = V[j * n + tid];\r\n\r\n V[i * n + tid] = c * v_it - s * v_jt;\r\n V[j * n + tid] = s * v_it + c * v_jt;\r\n }\r\n}"
Field Value
CUDAMatrixMultiplyTiledKernel
The c u d a matrix multiply tiled kernel.
public const string CUDAMatrixMultiplyTiledKernel = "\r\nextern \"C\" __global__ void matrix_multiply_tiled_cuda(\r\n const float* A, // Matrix A (M x K)\r\n const float* B, // Matrix B (K x N)\r\n float* C, // Result matrix C (M x N)\r\n const int M, // Rows of A and C\r\n const int N, // Columns of B and C\r\n const int K // Columns of A and rows of B\r\n) {\r\n const int TILE_SIZE = 16;\r\n\r\n // Shared memory for tiles\r\n __shared__ float tile_A[16][16];\r\n __shared__ float tile_B[16][16];\r\n\r\n // Thread and block indices\r\n int tx = threadIdx.x;\r\n int ty = threadIdx.y;\r\n int bx = blockIdx.x;\r\n int by = blockIdx.y;\r\n\r\n // Output coordinates\r\n int row = by * TILE_SIZE + ty;\r\n int col = bx * TILE_SIZE + tx;\r\n\r\n float sum = 0.0f;\r\n\r\n // Loop over tiles\r\n int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;\r\n for (int t = 0; t < num_tiles; t++) {\r\n // Load tile of A\r\n int a_row = row;\r\n int a_col = t * TILE_SIZE + tx;\r\n if (a_row < M && a_col < K) {\r\n tile_A[ty][tx] = A[a_row * K + a_col];\r\n } else {\r\n tile_A[ty][tx] = 0.0f;\r\n }\r\n\r\n // Load tile of B\r\n int b_row = t * TILE_SIZE + ty;\r\n int b_col = col;\r\n if (b_row < K && b_col < N) {\r\n tile_B[ty][tx] = B[b_row * N + b_col];\r\n } else {\r\n tile_B[ty][tx] = 0.0f;\r\n }\r\n\r\n __syncthreads();\r\n\r\n // Compute partial sum\r\n #pragma unroll\r\n for (int k = 0; k < TILE_SIZE; k++) {\r\n sum += tile_A[ty][k] * tile_B[k][tx];\r\n }\r\n\r\n __syncthreads();\r\n }\r\n\r\n // Store result\r\n if (row < M && col < N) {\r\n C[row * N + col] = sum;\r\n }\r\n}"
Field Value
CUDAVectorOperationsKernel
The c u d a vector operations kernel.
public const string CUDAVectorOperationsKernel = "\r\nextern \"C\" __global__ void vector_operations_cuda(\r\n const float* a, // Input vector a\r\n const float* b, // Input vector b\r\n float* result, // Output vector\r\n const int n, // Vector length\r\n const int operation // 0=add, 1=sub, 2=mul, 3=div\r\n) {\r\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n int stride = blockDim.x * gridDim.x;\r\n\r\n // Process multiple elements per thread for better efficiency\r\n for (int i = tid; i < n; i += stride) {\r\n float val_a = a[i];\r\n float val_b = b[i];\r\n\r\n switch (operation) {\r\n case 0: // Addition\r\n result[i] = val_a + val_b;\r\n break;\r\n case 1: // Subtraction\r\n result[i] = val_a - val_b;\r\n break;\r\n case 2: // Element-wise multiplication\r\n result[i] = val_a * val_b;\r\n break;\r\n case 3: // Element-wise division\r\n result[i] = (fabsf(val_b) > 1e-10f) ? val_a / val_b : 0.0f;\r\n break;\r\n }\r\n }\r\n}"
Field Value
CUDAWarpReductionKernel
The c u d a warp reduction kernel.
public const string CUDAWarpReductionKernel = "\r\n__device__ __forceinline__ float warp_reduce_sum(float val) {\r\n for (int offset = warpSize/2; offset > 0; offset /= 2) {\r\n val += __shfl_down_sync(0xFFFFFFFF, val, offset);\r\n }\r\n return val;\r\n}\r\n\r\nextern \"C\" __global__ void warp_reduction_cuda(\r\n const float* input, // Input array\r\n float* output, // Output array\r\n const int n // Input size\r\n) {\r\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\r\n int lane = threadIdx.x % warpSize;\r\n int warp_id = threadIdx.x / warpSize;\r\n\r\n __shared__ float warp_sums[32]; // Max 32 warps per block\r\n\r\n // Load and sum\r\n float sum = 0.0f;\r\n for (int i = tid; i < n; i += blockDim.x * gridDim.x) {\r\n sum += input[i];\r\n }\r\n\r\n // Warp-level reduction\r\n sum = warp_reduce_sum(sum);\r\n\r\n // Store warp result\r\n if (lane == 0) {\r\n warp_sums[warp_id] = sum;\r\n }\r\n\r\n __syncthreads();\r\n\r\n // Block-level reduction\r\n if (warp_id == 0) {\r\n sum = (threadIdx.x < (blockDim.x + warpSize - 1) / warpSize) ?\r\n warp_sums[threadIdx.x] : 0.0f;\r\n sum = warp_reduce_sum(sum);\r\n\r\n if (threadIdx.x == 0) {\r\n output[blockIdx.x] = sum;\r\n }\r\n }\r\n}"
Field Value
OpenCLHouseholderTransformKernel
The open c l householder transform kernel.
public const string OpenCLHouseholderTransformKernel = "\r\n__kernel void apply_householder_transform_optimized(\r\n __global float* matrix, // Input/output matrix\r\n __global const float* v, // Householder vector\r\n const int m, // Matrix rows\r\n const int n, // Matrix columns\r\n const int v_len, // Householder vector length\r\n const int start_row // Starting row index\r\n) {\r\n int col = get_global_id(0);\r\n int lid = get_local_id(0);\r\n\r\n if (col >= n) return;\r\n\r\n // Shared memory for dot product computation\r\n __local float shared_dot[256];\r\n shared_dot[lid] = 0.0f;\r\n\r\n // Compute dot product v^T * A[:, col] using work group collaboration\r\n float local_dot = 0.0f;\r\n for (int i = lid; i < v_len; i += get_local_size(0)) {\r\n if (start_row + i < m) {\r\n local_dot += v[i] * matrix[(start_row + i) * n + col];\r\n }\r\n }\r\n\r\n shared_dot[lid] = local_dot;\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Parallel reduction\r\n for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) {\r\n if (lid < offset) {\r\n shared_dot[lid] += shared_dot[lid + offset];\r\n }\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n }\r\n\r\n float total_dot = shared_dot[0] * 2.0f;\r\n\r\n // Apply transformation: A[:, col] = A[:, col] - 2 * (v^T * A[:, col]) * v\r\n for (int i = lid; i < v_len; i += get_local_size(0)) {\r\n if (start_row + i < m) {\r\n matrix[(start_row + i) * n + col] -= total_dot * v[i];\r\n }\r\n }\r\n}"
Field Value
OpenCLHouseholderVectorKernel
OpenCL kernel for computing Householder vector with parallel reduction.
public const string OpenCLHouseholderVectorKernel = "\r\n__kernel void compute_householder_vector_parallel(\r\n __global const float* column, // Input column vector\r\n __global float* householder, // Output Householder vector\r\n __global float* norm_result, // Output norm value\r\n const int n, // Vector length\r\n const int start_idx // Starting index\r\n) {\r\n int gid = get_global_id(0);\r\n int lid = get_local_id(0);\r\n int group_size = get_local_size(0);\r\n\r\n // Shared memory for reduction\r\n __local float shared_data[256];\r\n\r\n // Initialize shared memory\r\n shared_data[lid] = 0.0f;\r\n\r\n // Load and square elements for norm computation\r\n if (gid < n - start_idx) {\r\n float val = column[start_idx + gid];\r\n householder[gid] = val;\r\n shared_data[lid] = val * val;\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Parallel reduction for norm\r\n for (int offset = group_size / 2; offset > 0; offset /= 2) {\r\n if (lid < offset) {\r\n shared_data[lid] += shared_data[lid + offset];\r\n }\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n }\r\n\r\n // Compute norm and update first element\r\n if (lid == 0) {\r\n float norm = sqrt(shared_data[0]);\r\n norm_result[get_group_id(0)] = norm;\r\n\r\n if (gid == 0 && norm > 1e-10f) {\r\n float sign = householder[0] >= 0.0f ? 1.0f : -1.0f;\r\n householder[0] += sign * norm;\r\n }\r\n }\r\n\r\n barrier(CLK_GLOBAL_MEM_FENCE);\r\n\r\n // Second reduction for normalization\r\n if (gid < n - start_idx) {\r\n shared_data[lid] = householder[gid] * householder[gid];\r\n } else {\r\n shared_data[lid] = 0.0f;\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Reduction for vector norm\r\n for (int offset = group_size / 2; offset > 0; offset /= 2) {\r\n if (lid < offset) {\r\n shared_data[lid] += shared_data[lid + offset];\r\n }\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n }\r\n\r\n // Normalize vector\r\n if (lid == 0) {\r\n float vnorm = sqrt(shared_data[0]);\r\n if (vnorm > 1e-10f) {\r\n for (int i = 0; i < n - start_idx; i++) {\r\n householder[i] /= vnorm;\r\n }\r\n }\r\n }\r\n}"
Field Value
OpenCLLUDecompositionKernel
OpenCL kernel for LU decomposition with partial pivoting.
public const string OpenCLLUDecompositionKernel = "\r\n__kernel void lu_decomposition_step(\r\n __global float* A, // Matrix being decomposed\r\n __global int* P, // Permutation array\r\n const int n, // Matrix dimension\r\n const int k // Current step\r\n) {\r\n int tid = get_global_id(0);\r\n int row = tid + k + 1;\r\n\r\n if (row >= n) return;\r\n\r\n // Find pivot (would need separate kernel for full pivoting)\r\n __local float max_val;\r\n __local int pivot_row;\r\n\r\n if (get_local_id(0) == 0) {\r\n max_val = fabs(A[k * n + k]);\r\n pivot_row = k;\r\n\r\n for (int i = k + 1; i < n; i++) {\r\n float val = fabs(A[i * n + k]);\r\n if (val > max_val) {\r\n max_val = val;\r\n pivot_row = i;\r\n }\r\n }\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Swap rows if needed (simplified - would need atomic operations)\r\n if (pivot_row != k && get_local_id(0) == 0) {\r\n for (int j = 0; j < n; j++) {\r\n float temp = A[k * n + j];\r\n A[k * n + j] = A[pivot_row * n + j];\r\n A[pivot_row * n + j] = temp;\r\n }\r\n\r\n // Update permutation\r\n int temp_p = P[k];\r\n P[k] = P[pivot_row];\r\n P[pivot_row] = temp_p;\r\n }\r\n\r\n barrier(CLK_GLOBAL_MEM_FENCE);\r\n\r\n // Compute multipliers and eliminate\r\n if (row < n) {\r\n float pivot = A[k * n + k];\r\n if (fabs(pivot) > 1e-10f) {\r\n float multiplier = A[row * n + k] / pivot;\r\n A[row * n + k] = multiplier; // Store L\r\n\r\n // Eliminate\r\n for (int j = k + 1; j < n; j++) {\r\n A[row * n + j] -= multiplier * A[k * n + j];\r\n }\r\n }\r\n }\r\n}"
Field Value
OpenCLMatrixMultiplyTiledKernel
OpenCL kernel for tiled matrix multiplication with shared memory optimization.
public const string OpenCLMatrixMultiplyTiledKernel = "\r\n// Optimized tiled matrix multiplication using shared memory\r\n__kernel void matrix_multiply_tiled(\r\n __global const float* A, // Matrix A (M x K)\r\n __global const float* B, // Matrix B (K x N)\r\n __global float* C, // Result matrix C (M x N)\r\n const int M, // Rows of A and C\r\n const int N, // Columns of B and C\r\n const int K // Columns of A and rows of B\r\n) {\r\n // Tile size (should match local work size)\r\n const int TILE_SIZE = 16;\r\n\r\n // Work group and work item IDs\r\n int gid_x = get_group_id(0);\r\n int gid_y = get_group_id(1);\r\n int lid_x = get_local_id(0);\r\n int lid_y = get_local_id(1);\r\n\r\n // Shared memory for tiles\r\n __local float tile_A[16][16];\r\n __local float tile_B[16][16];\r\n\r\n // Output coordinates\r\n int row = gid_y * TILE_SIZE + lid_y;\r\n int col = gid_x * TILE_SIZE + lid_x;\r\n\r\n float sum = 0.0f;\r\n\r\n // Loop over tiles\r\n int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;\r\n for (int t = 0; t < num_tiles; t++) {\r\n // Load tile of A\r\n int a_row = row;\r\n int a_col = t * TILE_SIZE + lid_x;\r\n if (a_row < M && a_col < K) {\r\n tile_A[lid_y][lid_x] = A[a_row * K + a_col];\r\n } else {\r\n tile_A[lid_y][lid_x] = 0.0f;\r\n }\r\n\r\n // Load tile of B\r\n int b_row = t * TILE_SIZE + lid_y;\r\n int b_col = col;\r\n if (b_row < K && b_col < N) {\r\n tile_B[lid_y][lid_x] = B[b_row * N + b_col];\r\n } else {\r\n tile_B[lid_y][lid_x] = 0.0f;\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Compute partial sum\r\n for (int k = 0; k < TILE_SIZE; k++) {\r\n sum += tile_A[lid_y][k] * tile_B[k][lid_x];\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n }\r\n\r\n // Store result\r\n if (row < M && col < N) {\r\n C[row * N + col] = sum;\r\n }\r\n}"
Field Value
OpenCLMatrixVectorKernel
OpenCL kernel for optimized matrix-vector multiplication.
public const string OpenCLMatrixVectorKernel = "\r\n__kernel void matrix_vector_multiply_optimized(\r\n __global const float* A, // Matrix A (m x n)\r\n __global const float* x, // Vector x (n x 1)\r\n __global float* y, // Result vector y (m x 1)\r\n const int m, // Matrix rows\r\n const int n // Matrix columns\r\n) {\r\n int row = get_global_id(0);\r\n int lid = get_local_id(0);\r\n\r\n if (row >= m) return;\r\n\r\n // Use shared memory for vector caching\r\n __local float x_cache[256];\r\n\r\n float sum = 0.0f;\r\n\r\n // Process vector in chunks that fit in shared memory\r\n for (int chunk = 0; chunk < n; chunk += 256) {\r\n // Load chunk of x into shared memory\r\n if (lid < 256 && chunk + lid < n) {\r\n x_cache[lid] = x[chunk + lid];\r\n } else if (lid < 256) {\r\n x_cache[lid] = 0.0f;\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Compute partial dot product\r\n int chunk_size = min(256, n - chunk);\r\n for (int i = 0; i < chunk_size; i++) {\r\n sum += A[row * n + chunk + i] * x_cache[i];\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n }\r\n\r\n y[row] = sum;\r\n}"
Field Value
OpenCLParallelReductionKernel
OpenCL kernel for parallel reduction with multiple operations.
public const string OpenCLParallelReductionKernel = "\r\n__kernel void parallel_reduction_optimized(\r\n __global const float* input, // Input array\r\n __global float* output, // Output (one per work group)\r\n __local float* scratch, // Local memory for reduction\r\n const int n, // Input size\r\n const int operation // 0=sum, 1=max, 2=min, 3=norm2\r\n) {\r\n int gid = get_global_id(0);\r\n int lid = get_local_id(0);\r\n int group_size = get_local_size(0);\r\n\r\n // Initialize local memory\r\n float value = 0.0f;\r\n\r\n // Load data and handle boundary conditions\r\n if (gid < n) {\r\n value = input[gid];\r\n if (operation == 3) { // L2 norm squared\r\n value = value * value;\r\n }\r\n }\r\n\r\n // Handle operations that need special initialization\r\n if (operation == 2 && gid >= n) { // Min operation\r\n value = FLT_MAX;\r\n } else if (operation == 1 && gid >= n) { // Max operation\r\n value = -FLT_MAX;\r\n }\r\n\r\n scratch[lid] = value;\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Parallel reduction\r\n for (int offset = group_size / 2; offset > 0; offset /= 2) {\r\n if (lid < offset) {\r\n float other = scratch[lid + offset];\r\n\r\n switch (operation) {\r\n case 0: // Sum\r\n case 3: // L2 norm (sum of squares)\r\n scratch[lid] += other;\r\n break;\r\n case 1: // Max\r\n scratch[lid] = fmax(scratch[lid], other);\r\n break;\r\n case 2: // Min\r\n scratch[lid] = fmin(scratch[lid], other);\r\n break;\r\n }\r\n }\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n }\r\n\r\n // Store result\r\n if (lid == 0) {\r\n if (operation == 3) {\r\n output[get_group_id(0)] = sqrt(scratch[0]); // Final sqrt for L2 norm\r\n } else {\r\n output[get_group_id(0)] = scratch[0];\r\n }\r\n }\r\n}"
Field Value
OpenCLQRShiftKernel
OpenCL kernel for QR algorithm with Wilkinson shift.
public const string OpenCLQRShiftKernel = "\r\n__kernel void qr_algorithm_shift(\r\n __global float* A, // Hessenberg matrix (modified in-place)\r\n __global float* Q, // Accumulated transformations\r\n __global float* eigenvalues, // Output eigenvalues\r\n const int n, // Matrix dimension\r\n const float tolerance // Convergence tolerance\r\n) {\r\n int tid = get_global_id(0);\r\n\r\n if (tid >= n * n) return;\r\n\r\n // Compute Wilkinson shift from bottom-right 2x2 submatrix\r\n __local float shift_value;\r\n\r\n if (tid == 0) {\r\n if (n >= 2) {\r\n float a = A[(n-2) * n + (n-2)];\r\n float b = A[(n-2) * n + (n-1)];\r\n float c = A[(n-1) * n + (n-2)];\r\n float d = A[(n-1) * n + (n-1)];\r\n\r\n float trace = a + d;\r\n float det = a * d - b * c;\r\n float discriminant = trace * trace - 4.0f * det;\r\n\r\n if (discriminant >= 0.0f) {\r\n float sqrt_disc = sqrt(discriminant);\r\n float lambda1 = (trace + sqrt_disc) * 0.5f;\r\n float lambda2 = (trace - sqrt_disc) * 0.5f;\r\n\r\n // Choose eigenvalue closer to d\r\n shift_value = (fabs(lambda1 - d) < fabs(lambda2 - d)) ? lambda1 : lambda2;\r\n } else {\r\n shift_value = d;\r\n }\r\n } else {\r\n shift_value = 0.0f;\r\n }\r\n }\r\n\r\n barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n // Apply shift: A = A - shift * I\r\n int row = tid / n;\r\n int col = tid % n;\r\n\r\n if (row == col && tid < n * n) {\r\n A[tid] -= shift_value;\r\n }\r\n\r\n barrier(CLK_GLOBAL_MEM_FENCE);\r\n\r\n // Apply Givens rotations for QR factorization of Hessenberg matrix\r\n // (This would be called iteratively for each rotation)\r\n\r\n // After QR step, restore shift: A = A + shift * I\r\n if (row == col && tid < n * n) {\r\n A[tid] += shift_value;\r\n }\r\n\r\n // Extract eigenvalues from diagonal\r\n if (tid < n) {\r\n eigenvalues[tid] = A[tid * n + tid];\r\n }\r\n}"
Field Value
OpenCLSingularValuesKernel
The open c l singular values kernel.
public const string OpenCLSingularValuesKernel = "\r\n__kernel void extract_singular_values(\r\n __global const float* A, // Diagonalized matrix\r\n __global float* S, // Output singular values\r\n __global float* U, // Left singular vectors (to fix signs)\r\n const int n // Matrix dimension\r\n) {\r\n int i = get_global_id(0);\r\n\r\n if (i >= n) return;\r\n\r\n // Extract diagonal element and ensure positive\r\n float value = A[i * n + i];\r\n float abs_value = fabs(value);\r\n\r\n S[i] = abs_value;\r\n\r\n // If singular value was negative, flip corresponding column of U\r\n if (value < 0.0f) {\r\n for (int j = 0; j < n; j++) {\r\n U[j * n + i] = -U[j * n + i];\r\n }\r\n }\r\n}"
Field Value
Methods
GetKernelSource(LinearAlgebraOperation, string, string)
Gets the appropriate kernel source for a given operation and accelerator type.
public static string GetKernelSource(LinearAlgebraKernelLibrary.LinearAlgebraOperation operation, string acceleratorType, string precision = "single")
Parameters
operationLinearAlgebraKernelLibrary.LinearAlgebraOperationThe linear algebra operation.
acceleratorTypestringThe target accelerator type.
precisionstringPrecision mode (single/double).
Returns
- string
Kernel source code.