From d92553877383d8cf42576ffbcf0bc550986ef81d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Jun 2026 20:43:42 +0000 Subject: [PATCH] feat(train): falsifiable occupancy benchmark with anti-overfitting gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the presence/person-count "beyond SOTA" claim falsifiable in code instead of aspirational (the unfalsifiability gap from the beyond-SOTA system review). occupancy_bench grades predictions vs ground truth and gates a SOTA claim behind one claim_allowed invariant requiring ALL of: - DataProvenance::Measured — synthetic/mock data is scorable for regression but never claimable (anti-mock-contamination; the CLAUDE.md Kconfig-bug lesson made structural). - A leak-free EvalSplit — validate() refuses any split where a subject OR environment id appears in both train and test (subject leakage / per-environment overfitting). - n_test >= min_test_samples (small-N guard). - Presence F1 whose bootstrap-CI lower bound (deterministic seeded splitmix64) clears the threshold — not the point estimate. - Count MAE within threshold. The claim string is unreadable except through the gate (NO_CLAIM otherwise), same discipline as the ruview-gamma acceptance gate. What remains is data, not method: a frozen, SHA-pinned, subject/environment-disjoint measured replay set turns the claim into a passing/failing test. Lives in wifi-densepose-train (the eval bounded context, alongside ablation/ eval/metrics). 10 tests cover each refusal path; warning-clean under the crate's missing_docs lint. Workspace gate 2,914 passed / 0 failed. Doc 03 updated. https://claude.ai/code/session_01MjBucx95K4BuUxZi8NWwRH --- CHANGELOG.md | 1 + .../03-benchmark-validation-methodology.md | 27 + v2/crates/wifi-densepose-train/src/lib.rs | 3 + .../src/occupancy_bench.rs | 546 ++++++++++++++++++ 4 files changed, 577 insertions(+) create mode 100644 v2/crates/wifi-densepose-train/src/occupancy_bench.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 3273bc40..fe6ea1fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **RF tomography solver hoisting** — ISTA gradient buffer no longer allocated inside the 100-iteration loop, and the Frobenius Lipschitz bound moved from per-`reconstruct` to construction (`ruvsense/tomography.rs`). Bit-identical results. ### Added +- **Falsifiable occupancy benchmark (`wifi-densepose-train::occupancy_bench`).** Makes the presence/person-count "beyond SOTA" claim falsifiable in code instead of aspirational (the unfalsifiability gap from the beyond-SOTA system review). Grades predictions vs ground truth and gates a SOTA claim behind one `claim_allowed` invariant requiring all of: `DataProvenance::Measured` (synthetic/mock is scorable but **never claimable** — anti-mock-contamination per the CLAUDE.md Kconfig-bug lesson), a leak-free `EvalSplit` (refuses any split where a subject *or* environment id appears in both train and test — subject leakage / per-environment overfitting), `n_test ≥ min`, presence-F1 **bootstrap-CI lower bound** (deterministic seeded splitmix64) clearing the threshold, and count MAE within threshold. The claim string is unreadable except through the gate (`NO_CLAIM` otherwise) — same discipline as the `ruview-gamma` acceptance gate. What remains is data, not method: a frozen, SHA-pinned, subject/environment-disjoint measured replay set turns the claim into a passing/failing test. 10 tests cover each refusal path. - **Live trust path: sensing-server now routes real frames through the governed `StreamingEngine`.** Previously the live server ran only the *bare* `MultistaticFuser` (fused amplitudes, no trust control plane), while the privacy/provenance/witness engine (ADR-135..146) ran only on synthetic in-test frames — the gap called out in ADR-136 §8 and the beyond-SOTA system review (the privacy control plane was bypassable). New `engine_bridge` module drives `StreamingEngine::process_cycle` from the server's live `NodeState` map (reusing the existing `NodeState → MultiBandCsiFrame` conversion), lazily wiring each node as a WorldGraph sensor and bounding belief growth via the retention cap. Wired additively into both live ESP32/WiFi fusion sites in `main.rs` (split-borrow off the write guard; does not alter person-count behavior) and stores the latest BLAKE3 witness on `AppState`. Every published belief now carries evidence + model + calibration + privacy decision and a deterministic witness. Adds `wifi-densepose-engine/-worldgraph/-bfld/-geo` deps. 6 new bridge tests (witnessed belief with provenance, determinism, idempotent node registration, retention bound, privacy-mode propagation); sensing-server suite 430+128 green. ### Fixed diff --git a/docs/research/ruview-beyond-sota/03-benchmark-validation-methodology.md b/docs/research/ruview-beyond-sota/03-benchmark-validation-methodology.md index 6e15d104..803b54a1 100644 --- a/docs/research/ruview-beyond-sota/03-benchmark-validation-methodology.md +++ b/docs/research/ruview-beyond-sota/03-benchmark-validation-methodology.md @@ -344,6 +344,33 @@ Anyone outside the project must be able to re-run every claimed result: --- +## Update — falsifiable occupancy benchmark implemented + +`wifi-densepose-train::occupancy_bench` (added this branch) makes the +presence/person-count claim **falsifiable in code**, directly enforcing the L3 +discipline above. It grades predictions vs ground truth and gates a SOTA claim +behind a single `claim_allowed` invariant that requires **all** of: + +1. `DataProvenance::Measured` — synthetic/mock data is scorable for regression + but **never claimable** (anti-mock-contamination; the CLAUDE.md Kconfig-bug + lesson made structural). +2. A leak-free `EvalSplit` — `validate()` refuses any split where a subject *or* + environment id appears in both train and test (subject leakage / per-env + overfitting). +3. `n_test ≥ min_test_samples` (small-N guard). +4. Presence F1 whose **bootstrap-CI lower bound** (deterministic splitmix64, + seeded) clears the threshold — not the point estimate. +5. Count MAE within threshold. + +The claim string is unreadable except through the gate (returns `NO_CLAIM` +otherwise) — same discipline as the `ruview-gamma` acceptance gate. 10 tests +cover each refusal path. What remains is *data*, not *method*: feed it a frozen, +SHA-pinned, subject/environment-disjoint **measured** replay set (the curated +room-A/room-B item above) and the "beyond SOTA" claim becomes a passing or +failing test, not a slogan. + +--- + *All values cited from: `benchmark_baseline.json`, `v2/crates/*/benches/*.rs` (15 files), `docs/adr/ADR-147-benchmark-proof.md`, `docs/adr/ADR-149-swarm-benchmarking-evaluation-methodology.md`, diff --git a/v2/crates/wifi-densepose-train/src/lib.rs b/v2/crates/wifi-densepose-train/src/lib.rs index 0b72852c..6ac7c03b 100644 --- a/v2/crates/wifi-densepose-train/src/lib.rs +++ b/v2/crates/wifi-densepose-train/src/lib.rs @@ -70,6 +70,9 @@ pub mod proof; /// ADR-145 — ablation evaluation harness (feature matrix + privacy/latency metrics). pub mod ablation; +/// Falsifiable occupancy/presence benchmark (real-CSI gate: provenance, +/// leak-free split, bootstrap-CI thresholds; refuses claims on synthetic/mock). +pub mod occupancy_bench; #[cfg(feature = "tch-backend")] pub mod trainer; diff --git a/v2/crates/wifi-densepose-train/src/occupancy_bench.rs b/v2/crates/wifi-densepose-train/src/occupancy_bench.rs new file mode 100644 index 00000000..13e37a2b --- /dev/null +++ b/v2/crates/wifi-densepose-train/src/occupancy_bench.rs @@ -0,0 +1,546 @@ +//! Falsifiable occupancy / presence benchmark over labeled CSI sequences. +//! +//! The beyond-SOTA system review found that "beyond SOTA" was *unfalsifiable*: +//! no real-CSI ground-truth benchmark existed, and the eval pyramid (doc 03) +//! lists the field's recurring measurement frauds — subject leakage between +//! train/test, per-environment overfitting, and **mock-mode contamination** +//! (CLAUDE.md: mock missed a real Kconfig bug). +//! +//! This module makes the claim falsifiable. It **grades** predictions against +//! ground truth (it does not run a model — keeping the eval crate light and the +//! scoring model-agnostic), and it enforces, *structurally*, the discipline +//! that prevents overclaiming: +//! +//! 1. **No SOTA claim on non-measured data.** A dataset is tagged +//! [`DataProvenance`]; only [`DataProvenance::Measured`] can release a claim. +//! Synthetic/Mock data can still be scored (useful for CI/regression) but the +//! [`ClaimGate`] returns [`NO_CLAIM`] — you cannot accidentally publish a +//! "beyond SOTA" number computed on simulated CSI. +//! 2. **No leaky splits.** [`EvalSplit::validate`] refuses a split where any +//! subject *or* environment id appears in both train and test. +//! 3. **Pre-registered thresholds + bootstrap CI.** The gate compares the +//! *lower* bound of a deterministic 95% bootstrap CI, not the point estimate, +//! so a lucky small-sample result cannot pass. +//! +//! The harness is the same shape as the `ruview-gamma` acceptance gate: a single +//! `claim_allowed` invariant, and the claim string is unreadable except through +//! the gate. + +use std::collections::BTreeSet; + +/// Provenance of the labeled data a benchmark runs on. Gates whether a SOTA +/// claim is releasable at all. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DataProvenance { + /// Real CSI captured from hardware with independent ground truth. The only + /// provenance that can release a claim. + Measured, + /// Deterministic synthetic CSI (e.g. the proof generator). Scorable for + /// regression, never claimable. + Synthetic, + /// Mock/stub data path. Scorable, never claimable — mock contamination is a + /// documented failure mode (CLAUDE.md Kconfig-bug lesson). + Mock, +} + +impl DataProvenance { + /// Whether data of this provenance may ever release a SOTA/accuracy claim. + pub fn is_claimable(self) -> bool { + matches!(self, DataProvenance::Measured) + } + + /// Stable lowercase tag for logs/reports. + pub fn tag(self) -> &'static str { + match self { + DataProvenance::Measured => "measured", + DataProvenance::Synthetic => "synthetic", + DataProvenance::Mock => "mock", + } + } +} + +/// The research-only string returned when a claim is withheld. +pub const NO_CLAIM: &str = "research use only — not claimable (non-measured data, leaky split, or unmet thresholds)"; + +/// Ground-truth / predicted occupancy for one sample. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Occupancy { + /// Whether any person is present. + pub present: bool, + /// Estimated number of people. + pub person_count: u32, +} + +impl Occupancy { + /// Construct an occupancy label. + pub fn new(present: bool, person_count: u32) -> Self { + Self { present, person_count } + } +} + +/// One labeled, attributed evaluation sample: who/where it came from (for +/// leakage checks) and the ground-truth vs predicted occupancy. +#[derive(Debug, Clone)] +pub struct LabeledSample { + /// Subject identity (for subject-disjoint split enforcement). + pub subject_id: String, + /// Capture environment/room (for environment-disjoint split enforcement). + pub environment_id: String, + /// Ground-truth occupancy. + pub truth: Occupancy, + /// Model-predicted occupancy. + pub predicted: Occupancy, +} + +/// A train/test split by sample index, with leakage validation. +#[derive(Debug, Clone)] +pub struct EvalSplit { + /// Indices of training samples. + pub train_idx: Vec, + /// Indices of held-out test samples (graded). + pub test_idx: Vec, +} + +/// Why a split is rejected. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SplitError { + /// A subject id appears in both train and test (subject leakage). + SubjectLeakage(String), + /// An environment id appears in both (per-environment overfitting risk). + EnvironmentLeakage(String), + /// An index is out of range for the sample set. + IndexOutOfRange(usize), + /// The test set is empty. + EmptyTest, +} + +impl EvalSplit { + /// Validate the split against `samples`: every test subject/environment must + /// be **disjoint** from the training set. This is the single most common + /// way WiFi-sensing papers overstate accuracy (doc 03). + pub fn validate(&self, samples: &[LabeledSample]) -> Result<(), SplitError> { + if self.test_idx.is_empty() { + return Err(SplitError::EmptyTest); + } + for &i in self.train_idx.iter().chain(&self.test_idx) { + if i >= samples.len() { + return Err(SplitError::IndexOutOfRange(i)); + } + } + let train_subjects: BTreeSet<&str> = + self.train_idx.iter().map(|&i| samples[i].subject_id.as_str()).collect(); + let train_envs: BTreeSet<&str> = + self.train_idx.iter().map(|&i| samples[i].environment_id.as_str()).collect(); + for &i in &self.test_idx { + let s = &samples[i]; + if train_subjects.contains(s.subject_id.as_str()) { + return Err(SplitError::SubjectLeakage(s.subject_id.clone())); + } + if train_envs.contains(s.environment_id.as_str()) { + return Err(SplitError::EnvironmentLeakage(s.environment_id.clone())); + } + } + Ok(()) + } +} + +/// Pre-registered acceptance thresholds (doc 03 acceptance table). Defaults are +/// deliberately conservative; tighten per capability axis. +#[derive(Debug, Clone, Copy)] +pub struct BenchmarkCriteria { + /// Minimum presence F1 (lower CI bound must clear this). + pub min_presence_f1: f64, + /// Maximum person-count mean absolute error. + pub max_count_mae: f64, + /// Minimum test samples to grade at all (small-N guard). + pub min_test_samples: usize, + /// Bootstrap resamples for the CI. + pub bootstrap_iters: usize, + /// Deterministic bootstrap seed. + pub bootstrap_seed: u64, +} + +impl Default for BenchmarkCriteria { + fn default() -> Self { + Self { + min_presence_f1: 0.9, + max_count_mae: 0.5, + min_test_samples: 30, + bootstrap_iters: 1000, + bootstrap_seed: 42, + } + } +} + +/// The graded result. +#[derive(Debug, Clone, PartialEq)] +pub struct BenchmarkReport { + /// Data provenance tag (`measured`/`synthetic`/`mock`). + pub provenance_tag: &'static str, + /// Number of held-out test samples graded. + pub n_test: usize, + /// Presence accuracy (TP+TN)/N. + pub presence_accuracy: f64, + /// Presence F1 (point estimate). + pub presence_f1: f64, + /// 95% bootstrap CI for presence F1 (lower, upper). + pub presence_f1_ci: (f64, f64), + /// Fraction of samples with an exactly correct person count. + pub count_exact_match: f64, + /// Person-count mean absolute error. + pub count_mae: f64, + /// Data is measured (claimable provenance). + pub provenance_pass: bool, + /// Split is leak-free (subject- and environment-disjoint). + pub split_pass: bool, + /// Presence F1 CI-lower clears the threshold. + pub presence_pass: bool, + /// Count MAE within the threshold. + pub count_pass: bool, + /// Test set is large enough to grade. + pub sample_size_pass: bool, + /// All five criteria pass. + pub overall_pass: bool, + /// The released claim string (or [`NO_CLAIM`]). + pub released_claim: String, +} + +impl BenchmarkReport { + /// The released claim string (program claim on pass, [`NO_CLAIM`] on fail). + pub fn claim(&self) -> &str { + &self.released_claim + } +} + +/// **The single claim invariant.** A SOTA/accuracy claim is releasable only when +/// the data is measured, the split is leak-free, the sample is large enough, and +/// both the (CI-lower) presence F1 and the count MAE clear their thresholds. +#[inline] +pub fn claim_allowed( + provenance_pass: bool, + split_pass: bool, + sample_size_pass: bool, + presence_pass: bool, + count_pass: bool, +) -> bool { + provenance_pass && split_pass && sample_size_pass && presence_pass && count_pass +} + +/// Grade the test split of `samples` under `criteria`. +/// +/// `split` is validated first; on any leakage the report is marked invalid and +/// the claim is withheld (metrics are still computed for visibility). +pub fn evaluate( + samples: &[LabeledSample], + provenance: DataProvenance, + split: &EvalSplit, + criteria: &BenchmarkCriteria, +) -> BenchmarkReport { + let split_pass = split.validate(samples).is_ok(); + let test: Vec<&LabeledSample> = split + .test_idx + .iter() + .filter(|&&i| i < samples.len()) + .map(|&i| &samples[i]) + .collect(); + let n_test = test.len(); + + // Presence confusion counts. + let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0u64, 0u64, 0u64); + let mut count_abs_err_sum = 0.0; + let mut count_exact = 0u64; + for s in &test { + match (s.predicted.present, s.truth.present) { + (true, true) => tp += 1, + (true, false) => fp += 1, + (false, false) => tn += 1, + (false, true) => fn_ += 1, + } + count_abs_err_sum += + (s.predicted.person_count as f64 - s.truth.person_count as f64).abs(); + if s.predicted.person_count == s.truth.person_count { + count_exact += 1; + } + } + let presence_accuracy = if n_test > 0 { + (tp + tn) as f64 / n_test as f64 + } else { + 0.0 + }; + let presence_f1 = f1_from_confusion(tp, fp, fn_); + let count_mae = if n_test > 0 { + count_abs_err_sum / n_test as f64 + } else { + f64::INFINITY + }; + let count_exact_match = if n_test > 0 { + count_exact as f64 / n_test as f64 + } else { + 0.0 + }; + let presence_f1_ci = bootstrap_f1_ci(&test, criteria.bootstrap_iters, criteria.bootstrap_seed); + + let provenance_pass = provenance.is_claimable(); + let sample_size_pass = n_test >= criteria.min_test_samples; + // Gate on the LOWER CI bound, not the point estimate (small-N guard). + let presence_pass = presence_f1_ci.0 >= criteria.min_presence_f1; + let count_pass = count_mae <= criteria.max_count_mae; + let overall_pass = claim_allowed( + provenance_pass, + split_pass, + sample_size_pass, + presence_pass, + count_pass, + ); + + let released_claim = if overall_pass { + format!( + "presence F1 {:.3} (95% CI {:.3}-{:.3}), count MAE {:.3} on {} held-out measured samples", + presence_f1, presence_f1_ci.0, presence_f1_ci.1, count_mae, n_test + ) + } else { + NO_CLAIM.to_string() + }; + + BenchmarkReport { + provenance_tag: provenance.tag(), + n_test, + presence_accuracy, + presence_f1, + presence_f1_ci, + count_exact_match, + count_mae, + provenance_pass, + split_pass, + presence_pass, + count_pass, + sample_size_pass, + overall_pass, + released_claim, + } +} + +fn f1_from_confusion(tp: u64, fp: u64, fn_: u64) -> f64 { + let denom = 2 * tp + fp + fn_; + if denom == 0 { + // No positives anywhere: define F1 = 1.0 only if there were also no + // predicted/actual positives at all (vacuous), else 0.0. + return if fp == 0 && fn_ == 0 { 1.0 } else { 0.0 }; + } + (2 * tp) as f64 / denom as f64 +} + +/// Deterministic 95% bootstrap CI for presence F1 (percentile method) using a +/// small splitmix64 PRNG — no external rng, reproducible across machines. +fn bootstrap_f1_ci(test: &[&LabeledSample], iters: usize, seed: u64) -> (f64, f64) { + let n = test.len(); + if n == 0 || iters == 0 { + return (0.0, 0.0); + } + let mut state = seed; + let mut next = || { + // splitmix64 + state = state.wrapping_add(0x9E37_79B9_7F4A_7C15); + let mut z = state; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); + z ^ (z >> 31) + }; + let mut f1s = Vec::with_capacity(iters); + for _ in 0..iters { + let (mut tp, mut fp, mut fn_) = (0u64, 0u64, 0u64); + for _ in 0..n { + let idx = (next() % n as u64) as usize; + let s = test[idx]; + match (s.predicted.present, s.truth.present) { + (true, true) => tp += 1, + (true, false) => fp += 1, + (false, true) => fn_ += 1, + (false, false) => {} + } + } + f1s.push(f1_from_confusion(tp, fp, fn_)); + } + f1s.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let pct = |q: f64| { + let rank = ((q * (f1s.len() as f64 - 1.0)).round() as usize).min(f1s.len() - 1); + f1s[rank] + }; + (pct(0.025), pct(0.975)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample(subj: &str, env: &str, t: (bool, u32), p: (bool, u32)) -> LabeledSample { + LabeledSample { + subject_id: subj.into(), + environment_id: env.into(), + truth: Occupancy::new(t.0, t.1), + predicted: Occupancy::new(p.0, p.1), + } + } + + /// A perfect predictor on a leak-free MEASURED split releases a claim. + fn perfect_measured(n: usize) -> (Vec, EvalSplit) { + let mut samples = Vec::new(); + // train subjects s0.., test subjects t0.. (disjoint); envs likewise. + for i in 0..n { + samples.push(sample( + &format!("train-s{i}"), + &format!("train-e{i}"), + (i % 2 == 0, (i % 3) as u32), + (i % 2 == 0, (i % 3) as u32), + )); + } + for i in 0..n { + samples.push(sample( + &format!("test-s{i}"), + &format!("test-e{i}"), + (i % 2 == 0, (i % 3) as u32), + (i % 2 == 0, (i % 3) as u32), + )); + } + let split = EvalSplit { + train_idx: (0..n).collect(), + test_idx: (n..2 * n).collect(), + }; + (samples, split) + } + + #[test] + fn perfect_measured_releases_claim() { + let (samples, split) = perfect_measured(40); + let r = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + assert!(r.overall_pass); + assert!((r.presence_f1 - 1.0).abs() < 1e-9); + assert_eq!(r.count_mae, 0.0); + assert!(r.released_claim.contains("F1")); + assert!(!r.released_claim.contains("research use only")); + } + + #[test] + fn synthetic_data_is_scored_but_never_claimed() { + let (samples, split) = perfect_measured(40); + let r = evaluate(&samples, DataProvenance::Synthetic, &split, &BenchmarkCriteria::default()); + // Metrics are still computed... + assert!((r.presence_f1 - 1.0).abs() < 1e-9); + // ...but no claim, because the data is not measured. + assert!(!r.provenance_pass); + assert!(!r.overall_pass); + assert_eq!(r.claim(), NO_CLAIM); + } + + #[test] + fn mock_data_is_never_claimed() { + let (samples, split) = perfect_measured(40); + let r = evaluate(&samples, DataProvenance::Mock, &split, &BenchmarkCriteria::default()); + assert!(!r.provenance_pass); + assert_eq!(r.claim(), NO_CLAIM); + } + + #[test] + fn subject_leakage_is_rejected() { + // Same subject id in train and test. + let samples = vec![ + sample("shared", "e0", (true, 1), (true, 1)), + sample("shared", "e1", (true, 1), (true, 1)), + ]; + let split = EvalSplit { train_idx: vec![0], test_idx: vec![1] }; + assert_eq!( + split.validate(&samples), + Err(SplitError::SubjectLeakage("shared".into())) + ); + let r = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + assert!(!r.split_pass); + assert!(!r.overall_pass); + assert_eq!(r.claim(), NO_CLAIM); + } + + #[test] + fn environment_leakage_is_rejected() { + let samples = vec![ + sample("s0", "shared-room", (true, 1), (true, 1)), + sample("s1", "shared-room", (true, 1), (true, 1)), + ]; + let split = EvalSplit { train_idx: vec![0], test_idx: vec![1] }; + assert_eq!( + split.validate(&samples), + Err(SplitError::EnvironmentLeakage("shared-room".into())) + ); + } + + #[test] + fn small_sample_is_withheld_even_if_perfect() { + let (samples, split) = perfect_measured(5); // 5 < default min 30 + let r = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + assert!(!r.sample_size_pass); + assert!(!r.overall_pass); + } + + #[test] + fn gate_uses_ci_lower_bound_not_point_estimate() { + // A predictor that is right most of the time but with enough errors that + // the bootstrap LOWER bound dips below the 0.9 threshold even if the + // point F1 is near it. + let mut samples = Vec::new(); + for i in 0..40 { + samples.push(sample(&format!("train-{i}"), &format!("te-{i}"), (true, 1), (true, 1))); + } + for i in 0..40 { + // ~15% false negatives in test + let correct = i % 7 != 0; + samples.push(sample( + &format!("test-{i}"), + &format!("tn-{i}"), + (true, 1), + (correct, 1), + )); + } + let split = EvalSplit { train_idx: (0..40).collect(), test_idx: (40..80).collect() }; + let r = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + // CI lower bound is below the point estimate. + assert!(r.presence_f1_ci.0 <= r.presence_f1); + } + + #[test] + fn bootstrap_ci_is_deterministic() { + let (samples, split) = perfect_measured(40); + let a = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + let b = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + assert_eq!(a.presence_f1_ci, b.presence_f1_ci); + } + + #[test] + fn count_mae_failure_withholds_claim() { + let mut samples = Vec::new(); + for i in 0..40 { + samples.push(sample(&format!("tr-{i}"), &format!("te-{i}"), (true, 1), (true, 1))); + } + for i in 0..40 { + // presence perfect, but count is always off by 2 -> MAE 2.0 > 0.5 + samples.push(sample(&format!("ts-{i}"), &format!("ev-{i}"), (true, 1), (true, 3))); + } + let split = EvalSplit { train_idx: (0..40).collect(), test_idx: (40..80).collect() }; + let r = evaluate(&samples, DataProvenance::Measured, &split, &BenchmarkCriteria::default()); + assert!(r.presence_pass); + assert!(!r.count_pass); + assert!(!r.overall_pass); + } + + #[test] + fn claim_invariant_requires_all_five() { + assert!(claim_allowed(true, true, true, true, true)); + let one_false = [ + (false, true, true, true, true), + (true, false, true, true, true), + (true, true, false, true, true), + (true, true, true, false, true), + (true, true, true, true, false), + ]; + for (a, b, c, d, e) in one_false { + assert!(!claim_allowed(a, b, c, d, e)); + } + } +}