fix(ruview-swarm): fail-closed on NaN/Inf at swarm-comm trust boundary (ADR-148)

Beyond-SOTA security review of the ADR-148 drone swarm control plane found
four IEEE-754 NaN/Inf fail-open / DoS bugs on data crossing the untrusted
swarm-comm boundary (receive_peer_state / receive_peer_detection accept full
DroneState/CsiDetection whose f64/f32 fields deserialize with no finite-check).

- HIGH: failsafe::tick collision-avoidance + battery checks fail-open on NaN
  (NaN < threshold == false silently disabled collision avoidance / kept a
  NaN-battery drone Nominal). Now fails closed to EmergencyDiverge / RTH.
- MED: geofence::check NaN-altitude bypass returned Safe through the
  point-in-polygon path. Now leading non-finite-coordinate guard -> HardBreach.
- MED/DoS: antijamming FhssRadio panicked with "% 0" on an empty deserialized
  channels_mhz. Now len==0 early-returns (benign 0.0 sentinel).
- LOW: multiview::fuse propagated a NaN victim_position into the fused
  "confirmed victim" location. Now requires finite confidence + position.

Each fix pinned by a fails-on-old / passes-on-new test (MEASURED: old code
returned Nominal/Safe or panicked). cargo test -p ruview-swarm
--no-default-features: 117 -> 123 passed, 0 failed. Workspace green; Python
deterministic proof unchanged (f8e76f21...46f7a, off the signal path).

Documented-not-fixed (ADR slot 176): Raft AppendEntries lacks Log-Matching
consistency check (topology/raft.rs); MavlinkSigner::verify uses non-constant
-time tag compare + no replay-window rejection (already doc-flagged).

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
ruv
2026-06-15 08:57:36 -04:00
parent 0f64d23516
commit f671000d70
5 changed files with 179 additions and 9 deletions
+1
View File
File diff suppressed because one or more lines are too long
+48 -4
View File
@@ -47,8 +47,18 @@ impl FailSafeMachine {
link_alive: bool,
nearest_neighbor_dist: f64,
) -> FailSafeState {
// Collision avoidance has highest priority
if nearest_neighbor_dist < self.collision_dist_m {
// Collision avoidance has highest priority.
//
// Fail CLOSED on a non-finite neighbour distance. `nearest_neighbor_dist`
// is derived from peer positions (see
// `SwarmOrchestrator::nearest_peer_distance`), which arrive over the
// untrusted swarm comm layer as `DroneState` values whose f64 position
// fields can deserialize to NaN/Inf. A naive `NaN < collision_dist_m`
// evaluates to `false`, silently DISABLING collision avoidance — the
// worst possible failure for a physical drone. Treat a non-finite
// distance as "too close" so the swarm diverges rather than trusting a
// poisoned reading.
if !nearest_neighbor_dist.is_finite() || nearest_neighbor_dist < self.collision_dist_m {
self.state = FailSafeState::EmergencyDiverge;
return self.state.clone();
}
@@ -71,8 +81,11 @@ impl FailSafeMachine {
}
}
// Battery checks
if state.battery_pct <= self.battery_rth_pct {
// Battery checks. A non-finite battery reading (NaN/Inf from a corrupt or
// forged telemetry/peer message) must fail CLOSED: `NaN <= threshold` is
// `false`, which would otherwise let a drone with an unknown battery
// level keep flying nominally. Treat a non-finite reading as critical.
if !state.battery_pct.is_finite() || state.battery_pct <= self.battery_rth_pct {
self.state = FailSafeState::ReturnToHome;
} else if state.battery_pct <= self.battery_warn_pct {
self.state = FailSafeState::LowBatteryWarn;
@@ -144,4 +157,35 @@ mod tests {
let result = fsm.tick(&s, true, 0.5); // too close
assert_eq!(result, FailSafeState::EmergencyDiverge);
}
/// Security: a NaN neighbour distance (poisoned peer position over the swarm
/// comm layer) must NOT silently disable collision avoidance. Fails on old
/// code where `NaN < collision_dist_m` is `false` and the state stays Nominal.
#[test]
fn test_nan_neighbor_distance_fails_closed_to_diverge() {
let mut fsm = FailSafeMachine::new();
let s = good_state();
let result = fsm.tick(&s, true, f64::NAN);
assert_eq!(
result,
FailSafeState::EmergencyDiverge,
"non-finite neighbour distance must fail closed to EmergencyDiverge"
);
}
/// Security: a NaN battery reading must fail closed to ReturnToHome rather
/// than being treated as a healthy battery. Fails on old code where
/// `NaN <= battery_rth_pct` is `false` and the drone stays Nominal.
#[test]
fn test_nan_battery_fails_closed_to_rth() {
let mut fsm = FailSafeMachine::new();
let mut s = good_state();
s.battery_pct = f32::NAN;
let result = fsm.tick(&s, true, 10.0);
assert_eq!(
result,
FailSafeState::ReturnToHome,
"non-finite battery must fail closed to ReturnToHome"
);
}
}
@@ -59,8 +59,16 @@ impl FhssRadio {
}
/// Returns the current active channel frequency in MHz.
///
/// `FhssConfig` is `Deserialize`, so `channels_mhz` can arrive empty from a
/// malformed or hostile config. An empty channel list would make `% n`
/// (n = 0) panic with a divide-by-zero. Guard it and return a benign `0.0`
/// sentinel instead of crashing the radio task (DoS-resistance).
pub fn current_channel_mhz(&self) -> f64 {
let n = self.config.channels_mhz.len();
if n == 0 {
return 0.0;
}
// XOR node seed into hop index so each node uses a different offset
let idx = (self.hop_index ^ (self.node_seed as usize)) % n;
self.config.channels_mhz[idx]
@@ -68,7 +76,11 @@ impl FhssRadio {
/// Advance the hop sequence by one step (call at hop_rate_hz).
pub fn next_hop(&mut self) {
self.hop_index = (self.hop_index + 1) % self.config.channels_mhz.len();
let n = self.config.channels_mhz.len();
if n == 0 {
return; // no channels configured — nothing to hop (avoid `% 0` panic)
}
self.hop_index = (self.hop_index + 1) % n;
}
/// Update with latest RSSI measurement. Drives jamming detection.
@@ -97,9 +109,13 @@ impl FhssRadio {
.wrapping_mul(lcg_a)
.wrapping_add(self.evasion_count)
.wrapping_add(lcg_c);
let n = self.config.channels_mhz.len() as u64;
let len = self.config.channels_mhz.len();
if len == 0 {
return; // no channels configured — avoid `% 0` panic
}
let n = len as u64;
let offset = (seed % n / 4 + 3) as usize;
self.hop_index = (self.hop_index + offset) % self.config.channels_mhz.len();
self.hop_index = (self.hop_index + offset) % len;
self.evasion_count += 1;
self.rssi_history.clear();
}
@@ -165,6 +181,23 @@ mod tests {
assert_eq!(radio.hop_index, (initial_idx + 2) % 50);
}
/// Security/DoS: an empty `channels_mhz` (deserialized from a malformed or
/// hostile config) must not panic with a `% 0` divide-by-zero. Fails on old
/// code, where `next_hop`/`current_channel_mhz`/`evasive_hop`/`tick` all do
/// modulo / index by `channels_mhz.len()`.
#[test]
fn test_empty_channels_does_not_panic() {
let cfg = FhssConfig { channels_mhz: vec![], jamming_detect_window: 1, ..Default::default() };
let mut radio = FhssRadio::new(7, cfg);
// None of these may panic.
let _ = radio.current_channel_mhz();
radio.next_hop();
radio.observe_rssi(-99.0); // window=1 → jamming_detected() true → evasive_hop()
radio.tick(100.0);
radio.evasive_hop();
assert_eq!(radio.current_channel_mhz(), 0.0, "empty channel list returns sentinel");
}
#[test]
fn test_channel_in_valid_range() {
let cfg = FhssConfig::default();
@@ -27,6 +27,16 @@ pub enum GeofenceResult {
impl Geofence {
/// Check a position against this geofence.
pub fn check(&self, pos: &Position3D) -> GeofenceResult {
// Fail CLOSED on a non-finite position. A NaN/Inf component (from a
// corrupt GPS/EKF estimate or a forged position) makes every subsequent
// comparison false: `NaN < min || NaN > max` is `false`, so the altitude
// breach is skipped, and a NaN altitude with otherwise-valid x/y would
// return `Safe` — a silent geofence bypass on a flight-safety boundary.
// Treat any non-finite coordinate as a hard breach.
if !pos.x.is_finite() || !pos.y.is_finite() || !pos.z.is_finite() {
return GeofenceResult::HardBreach;
}
let altitude_m = -pos.z; // NED: negative z = altitude above ground
// Altitude check
@@ -146,4 +156,29 @@ mod tests {
let pos = Position3D { x: 50.0, y: 50.0, z: -200.0 }; // 200m altitude
assert_eq!(f.check(&pos), GeofenceResult::HardBreach);
}
/// Security: a NaN altitude with an otherwise in-bounds x/y must fail closed
/// to HardBreach. Fails on old code where `NaN < min || NaN > max` is `false`,
/// the altitude check is skipped, and the point-in-polygon path returns Safe —
/// a silent geofence bypass.
#[test]
fn test_nan_altitude_fails_closed() {
let f = square_fence();
let pos = Position3D { x: 50.0, y: 50.0, z: f64::NAN };
assert_eq!(f.check(&pos), GeofenceResult::HardBreach);
}
/// Security: NaN/Inf horizontal coordinates must also fail closed.
#[test]
fn test_nonfinite_horizontal_fails_closed() {
let f = square_fence();
assert_eq!(
f.check(&Position3D { x: f64::NAN, y: 50.0, z: -30.0 }),
GeofenceResult::HardBreach
);
assert_eq!(
f.check(&Position3D { x: 50.0, y: f64::INFINITY, z: -30.0 }),
GeofenceResult::HardBreach
);
}
}
@@ -64,10 +64,25 @@ impl MultiViewFusion {
detections: &[CsiDetection],
drone_positions: &[(NodeId, Position3D)],
) -> Option<FusedDetection> {
// Filter by confidence and require estimated position
// Filter by confidence and require a FINITE estimated position.
//
// A peer detection (received via `receive_peer_detection`) carries f32/f64
// fields that can deserialize to NaN/Inf. A NaN `victim_position` passes
// `is_some()` and would propagate through the confidence-weighted average
// into the fused position — dispatching a NaN "confirmed victim" location
// to the swarm. A NaN `confidence` is already rejected by `>= min_confidence`
// (NaN comparisons are false), but we make that explicit and also require
// the victim position components to be finite. Fail CLOSED: drop poisoned
// detections rather than fusing them.
let valid: Vec<(&CsiDetection, &Position3D)> = detections
.iter()
.filter(|d| d.confidence >= self.min_confidence && d.victim_position.is_some())
.filter(|d| {
d.confidence.is_finite()
&& d.confidence >= self.min_confidence
&& d.victim_position
.map(|p| p.x.is_finite() && p.y.is_finite() && p.z.is_finite())
.unwrap_or(false)
})
.filter_map(|d| {
let drone_pos = drone_positions
.iter()
@@ -177,4 +192,46 @@ mod tests {
result.uncertainty_m
);
}
/// Security: a detection with a NaN victim position (poisoned peer report)
/// must be dropped, not fused. Fails on old code where the NaN propagates
/// into the confidence-weighted average and the fused position is NaN.
#[test]
fn test_nan_victim_position_dropped_from_fusion() {
let fusion = MultiViewFusion { min_viewpoints: 2, min_confidence: 0.5 };
let detections = vec![
CsiDetection {
drone_id: NodeId(0),
confidence: 0.9,
victim_position: Some(Position3D { x: 50.0, y: 50.0, z: 0.0 }),
timestamp_ms: 0,
},
CsiDetection {
drone_id: NodeId(1),
confidence: 0.9,
victim_position: Some(Position3D { x: f64::NAN, y: 50.0, z: 0.0 }),
timestamp_ms: 0,
},
CsiDetection {
drone_id: NodeId(2),
confidence: 0.9,
victim_position: Some(Position3D { x: 50.0, y: 50.0, z: 0.0 }),
timestamp_ms: 0,
},
];
let positions = vec![
(NodeId(0), Position3D { x: 0.0, y: 0.0, z: -30.0 }),
(NodeId(1), Position3D { x: 100.0, y: 0.0, z: -30.0 }),
(NodeId(2), Position3D { x: 50.0, y: 86.6, z: -30.0 }),
];
// Two finite viewpoints remain → still fuses, but the result must be finite.
let result = fusion.fuse(&detections, &positions).unwrap();
assert!(
result.estimated_position.x.is_finite()
&& result.estimated_position.y.is_finite()
&& result.estimated_position.z.is_finite(),
"fused position must be finite when a NaN detection is present"
);
assert!(!result.contributing_drones.contains(&NodeId(1)), "NaN detection must be excluded");
}
}