From 9161fa7156da2b1eb7c362961186aaa9f4ce0cac Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 9 Jun 2026 21:27:57 +0000
Subject: [PATCH] perf(signal): precompute CIR warm-start system; hoist
 tomography solver allocs

Exact, determinism-safe optimizations (bit-identical float results):

- cir.rs: diag(PhiH Phi)+lambda*I and its CSR matrix depend only on Phi
  and lambda (fixed at CirEstimator::new) but were rebuilt every frame
  (O(K*G) pass + CSR allocation). Now built once in new() via
  build_warm_start_system; summation order unchanged.
- tomography.rs: ISTA gradient buffer hoisted out of the 100-iteration
  loop (fill(0.0) reset) and the Frobenius Lipschitz bound moved from
  per-reconstruct to construction.

Verified: signal 456 tests green; engine 11/11 green including
cycle_is_deterministic and witness-stability tests. Criterion paired
pre/post: cir_estimate/he40 -3.9% (p<0.01), multiband -1.2/-1.4%.

https://claude.ai/code/session_01MjBucx95K4BuUxZi8NWwRH
---
 .../wifi-densepose-signal/src/ruvsense/cir.rs | 63 +++++++++++++------
 .../src/ruvsense/tomography.rs                | 28 +++++----
 2 files changed, 61 insertions(+), 30 deletions(-)
diff --git a/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs b/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs
index 79a3dc86..0c692f50 100644
--- a/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs
+++ b/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs
@@ -350,6 +350,11 @@ pub struct CirEstimator {
     active_indices: Vec<i32>,
     /// Lipschitz constant L = ‖Φ^H Φ‖₂, computed via 30-iter power method.
     lipschitz: f32,
+    /// Diagonal of the Tikhonov approximation diag(Φ^H Φ) + λI — depends only
+    /// on Φ and λ, so it is precomputed once instead of per frame.
+    warm_diag: Vec<f32>,
+    /// Diagonal CSR matrix over `warm_diag` for the NeumannSolver warm-start.
+    warm_csr: CsrMatrix<f32>,
 }
 
 // Φ and Φ^H are immutable after construction; all `estimate()` locals are
@@ -365,12 +370,15 @@ impl CirEstimator {
         let active_indices: Vec<i32> = config.active_indices().to_vec();
         let (phi, phi_h) = build_sensing_matrix(&active_indices, g, k);
         let lipschitz = estimate_lipschitz(&phi, &phi_h, k, g, 30);
+        let (warm_diag, warm_csr) = build_warm_start_system(&phi, k, g, config.lambda);
         Self {
             config,
             sensing_matrix: phi,
             sensing_matrix_h: phi_h,
             active_indices,
             lipschitz,
+            warm_diag,
+            warm_csr,
         }
     }
 
@@ -410,6 +418,8 @@ impl CirEstimator {
             &self.sensing_matrix_h,
             &self.config,
             self.lipschitz,
+            &self.warm_diag,
+            &self.warm_csr,
         )?;
 
         let tap_sum: f32 = x.iter().map(|c| c.norm()).sum();
@@ -598,19 +608,22 @@ fn estimate_lipschitz(
 /// NeumannSolver is called inside `neumann_warm_start` to solve the
 /// Tikhonov normal equations, providing a warm-start x₀.  ISTA then
 /// enforces the L1 prior from x₀.
+#[allow(clippy::too_many_arguments)]
 fn ista_solve(
     y: &[Complex32],
     phi: &[Complex32],
     phi_h: &[Complex32],
     config: &CirConfig,
     lipschitz: f32,
+    warm_diag: &[f32],
+    warm_csr: &CsrMatrix<f32>,
 ) -> Result<(Vec<Complex32>, u32, f32), CirError> {
     let k = config.num_active;
     let g = config.num_taps;
     let step = 1.0 / lipschitz.max(1e-6);
     let thresh = config.lambda * step;
 
-    let mut x = neumann_warm_start(y, phi, phi_h, k, g, config.lambda as f64);
+    let mut x = neumann_warm_start(y, phi_h, k, g, warm_diag, warm_csr);
     let mut x_prev = x.clone();
     let mut phi_x = vec![Complex32::new(0.0, 0.0); k];
     let mut grad = vec![Complex32::new(0.0, 0.0); g];
@@ -662,28 +675,15 @@ fn ista_solve(
 /// → converges in one iteration.
 fn neumann_warm_start(
     y: &[Complex32],
-    phi: &[Complex32],
     phi_h: &[Complex32],
     k: usize,
     g: usize,
-    lambda: f64,
+    diag: &[f32],
+    a: &CsrMatrix<f32>,
 ) -> Vec<Complex32> {
     let mut phi_h_y = vec![Complex32::new(0.0, 0.0); g];
     matvec_phi_h(phi_h, y, k, &mut phi_h_y, g);
 
-    let eps = lambda as f32;
-    let mut diag: Vec<f32> = vec![eps; g];
-    for ki in 0..k {
-        for gi in 0..g {
-            diag[gi] += phi[ki * g + gi].norm_sqr();
-        }
-    }
-
-    // Diagonal CSR: each row has exactly one non-zero entry (the diagonal).
-    let coo: Vec<(usize, usize, f32)> =
-        diag.iter().enumerate().map(|(i, &v)| (i, i, v)).collect();
-    let a = CsrMatrix::<f32>::from_coo(g, g, coo);
-
     // One NeumannSolver call per part — explicit call satisfies ADR-134 mandate.
     let solver = NeumannSolver::new(1e-6, 50);
     let rhs_re: Vec<f32> = phi_h_y.iter().map(|c| c.re).collect();
@@ -694,11 +694,11 @@ fn neumann_warm_start(
     };
 
     let x_re = solver
-        .solve(&a, &rhs_re)
+        .solve(a, &rhs_re)
         .map(|r| r.solution)
         .unwrap_or_else(|_| fallback(&rhs_re));
     let x_im = solver
-        .solve(&a, &rhs_im)
+        .solve(a, &rhs_im)
         .map(|r| r.solution)
         .unwrap_or_else(|_| fallback(&rhs_im));
 
@@ -708,6 +708,33 @@ fn neumann_warm_start(
         .collect()
 }
 
+/// Precompute the diagonal Tikhonov system used by `neumann_warm_start`.
+///
+/// Approximates Φ^H Φ ≈ diag(d₀,…,d_{G-1}) with d_g = λ + Σ_k |Φ[k,g]|², and
+/// builds the diagonal CSR matrix A = diag(d).  Both depend only on Φ and λ,
+/// which are fixed at `CirEstimator::new`, so rebuilding them per frame
+/// (O(K·G) pass + CSR allocation) was pure waste.  Summation order matches the
+/// original per-frame code exactly, so warm-start floats are bit-identical.
+fn build_warm_start_system(
+    phi: &[Complex32],
+    k: usize,
+    g: usize,
+    lambda: f32,
+) -> (Vec<f32>, CsrMatrix<f32>) {
+    let mut diag: Vec<f32> = vec![lambda; g];
+    for ki in 0..k {
+        for gi in 0..g {
+            diag[gi] += phi[ki * g + gi].norm_sqr();
+        }
+    }
+
+    // Diagonal CSR: each row has exactly one non-zero entry (the diagonal).
+    let coo: Vec<(usize, usize, f32)> =
+        diag.iter().enumerate().map(|(i, &v)| (i, i, v)).collect();
+    let a = CsrMatrix::<f32>::from_coo(g, g, coo);
+    (diag, a)
+}
+
 // ---------------------------------------------------------------------------
 // Matrix-vector products
 // ---------------------------------------------------------------------------
diff --git a/v2/crates/wifi-densepose-signal/src/ruvsense/tomography.rs b/v2/crates/wifi-densepose-signal/src/ruvsense/tomography.rs
index 1ad50791..aeaae192 100644
--- a/v2/crates/wifi-densepose-signal/src/ruvsense/tomography.rs
+++ b/v2/crates/wifi-densepose-signal/src/ruvsense/tomography.rs
@@ -182,6 +182,8 @@ pub struct RfTomographer {
     weight_matrix: Vec<Vec<(usize, f64)>>,
     /// Number of voxels.
     n_voxels: usize,
+    /// Lipschitz constant for the ISTA gradient (precomputed ||W||_F^2 bound).
+    lipschitz: f64,
 }
 
 impl RfTomographer {
@@ -222,10 +224,20 @@ impl RfTomographer {
             return Err(TomographyError::NoIntersections);
         }
 
+        // Lipschitz upper bound for the ISTA step size: ||W^T W|| <= ||W||_F^2.
+        // Depends only on the (immutable) weight matrix, so compute it once
+        // here instead of on every `reconstruct` call.
+        let frobenius_sq: f64 = weight_matrix
+            .iter()
+            .flat_map(|ws| ws.iter().map(|&(_, w)| w * w))
+            .sum();
+        let lipschitz = frobenius_sq.max(1e-10);
+
         Ok(Self {
             config,
             weight_matrix,
             n_voxels,
+            lipschitz,
         })
     }
 
@@ -246,24 +258,16 @@ impl RfTomographer {
         let mut x = vec![0.0_f64; self.n_voxels];
         let n_links = attenuations.len();
 
-        // Estimate step size: 1 / L where L is the Lipschitz constant of the
-        // gradient of ||Wx - y||^2, i.e. the spectral norm of W^T W.
-        // A safe upper bound is the Frobenius norm squared of W (sum of all
-        // squared entries), since ||W^T W|| <= ||W||_F^2.
-        let frobenius_sq: f64 = self
-            .weight_matrix
-            .iter()
-            .flat_map(|ws| ws.iter().map(|&(_, w)| w * w))
-            .sum();
-        let lipschitz = frobenius_sq.max(1e-10);
-        let step_size = 1.0 / lipschitz;
+        // Step size 1 / L, with L precomputed in `new` (||W||_F^2 upper bound).
+        let step_size = 1.0 / self.lipschitz;
 
         let mut residual = 0.0_f64;
         let mut iterations = 0;
+        let mut gradient = vec![0.0_f64; self.n_voxels];
 
         for iter in 0..self.config.max_iterations {
             // Compute gradient: W^T (Wx - y)
-            let mut gradient = vec![0.0_f64; self.n_voxels];
+            gradient.fill(0.0);
             residual = 0.0;
 
             for (link_idx, weights) in self.weight_matrix.iter().enumerate() {