ruvnet--RuView/examples/research-sota/r9_rssi_fingerprint_knn.py

#!/usr/bin/env python3
"""R9 — RSSI fingerprint topology: does temporal proximity = feature proximity?

See docs/research/sota-2026-05-22/R9-rssi-fingerprint-knn.md.

Hypothesis: if RSSI sequences from temporally-adjacent windows are
nearest-neighbours in feature space, RSSI-fingerprint localisation is
viable. If the K-NN of every query is random in time, RSSI sequences
don't carry stable enough fingerprints — fall back to multi-modal cues
(BSSID lists, signal-of-opportunity).

Test:
  1. Build the same 20-dim RSSI proxy from the 1,077 paired windows
     (band-mean across 56 subcarriers per frame).
  2. For each sample i, find K-NN in cosine-similarity space.
  3. Measure: what fraction of the K-NN come from windows within
     ±60 seconds of the query's timestamp?
  4. Compare to a random baseline (what would the fraction be if K-NN
     were chosen at random?).

If the temporal-K-NN fraction is ≫ random, RSSI fingerprints have stable
spatial structure → R9 viable.

Usage:
    python examples/research-sota/r9_rssi_fingerprint_knn.py \
        --paired data/paired/wiflow-p7-1779210883.paired.jsonl
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np

# Expected CSI window layout: 56 subcarriers x 20 frames per paired window.
N_SUB, N_FRAMES = 56, 20


def load_rssi_proxy(path: Path) -> tuple[np.ndarray, np.ndarray]:
    """Load the per-window RSSI proxy and window start times from a JSONL file.

    Each non-blank line must be a JSON object with a ``csi`` field holding
    ``N_SUB * N_FRAMES`` values. Records whose ``csi_shape`` is present and
    differs from ``[N_SUB, N_FRAMES]`` are skipped.

    Args:
        path: Paired-window ``.jsonl`` file, read as UTF-8.

    Returns:
        ``(X_rssi, ts_seconds)`` where ``X_rssi`` is a float32 array of shape
        ``[N, N_FRAMES]`` (mean over the subcarrier axis of every window) and
        ``ts_seconds`` is a float64 array of shape ``[N]`` holding POSIX
        timestamps parsed from ``ts_start``.

    Raises:
        ValueError: if no record with the expected shape is found, or if a
            ``csi`` payload cannot be reshaped to ``[N_SUB, N_FRAMES]``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an accepted record has no ``csi`` field.
    """
    features: list[np.ndarray] = []
    stamps: list[float] = []
    expected_shape = [N_SUB, N_FRAMES]
    with path.open(encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            if record.get("csi_shape", expected_shape) != expected_shape:
                continue
            csi = np.asarray(record["csi"], dtype=np.float32).reshape(N_SUB, N_FRAMES)
            features.append(csi.mean(axis=0))  # band-mean → [N_FRAMES]

            # NOTE(review): a record without ``ts_start`` is stamped with the
            # Unix epoch, which places it far from every real window — confirm
            # this fallback is intended rather than skipping the record.
            t_iso = record.get("ts_start", "1970-01-01T00:00:00Z")
            # "Z" is rewritten because fromisoformat() only accepts it from
            # Python 3.11 onwards.
            when = datetime.fromisoformat(t_iso.replace("Z", "+00:00"))
            if when.tzinfo is None:
                # A naive datetime would be interpreted in the host's local
                # timezone by .timestamp(); treat offset-less stamps as UTC so
                # results do not depend on the machine (or on DST changes).
                when = when.replace(tzinfo=timezone.utc)
            stamps.append(when.timestamp())

    if not features:
        raise ValueError(
            f"no CSI windows of shape {expected_shape} found in {path}"
        )
    return np.stack(features), np.asarray(stamps, dtype=np.float64)


def main():
    """Run the R9 temporal-locality test and write a JSON summary.

    Loads the per-window RSSI proxy, z-scores every feature across samples,
    ranks all other samples by cosine similarity, and measures which fraction
    of each query's top-K neighbours has a timestamp within
    ``--temporal-window-s`` seconds of the query. That fraction is divided by
    the chance rate (the share of *other* samples inside the same window) to
    obtain a lift, which is mapped to a STRONG (>= 3.0), MODERATE (>= 1.5) or
    WEAK verdict. The numbers are printed and written to ``--out`` as JSON.

    Exits with a usage error when ``--k`` is below 1, and with an error
    message when fewer than two usable windows were loaded (a query needs at
    least one other sample to have a neighbour).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--paired", required=True)
    parser.add_argument("--out", default="examples/research-sota/r9_rssi_fingerprint_results.json")
    parser.add_argument("--k", type=int, default=5)
    parser.add_argument("--temporal-window-s", type=float, default=60.0)
    args = parser.parse_args()
    if args.k < 1:
        parser.error("--k must be a positive integer")

    print(f"Loading RSSI-proxy from {args.paired}")
    X, ts = load_rssi_proxy(Path(args.paired))
    N = X.shape[0]
    print(f"  N samples: {N}, feature dim: {X.shape[1]}")
    if N < 2:
        # With a single sample the baseline is 0 and the lift explodes to a
        # meaningless "STRONG" verdict, so refuse to continue.
        raise SystemExit(
            f"error: need at least 2 usable windows for K-NN, found {N} in {args.paired}"
        )
    print(f"  time range: {datetime.fromtimestamp(ts.min(), tz=timezone.utc):%H:%M:%S} - "
          f"{datetime.fromtimestamp(ts.max(), tz=timezone.utc):%H:%M:%S}  "
          f"({(ts.max() - ts.min()) / 60:.1f} min total)")

    # Never ask for more neighbours than there are other samples; otherwise
    # the masked self-match would be selected and counted as a hit (Δt = 0).
    K = min(args.k, N - 1)
    if K < args.k:
        print(f"  note: --k={args.k} exceeds the {N - 1} available neighbours; using K={K}")
    W = args.temporal_window_s

    # Z-score normalise across all samples — what a real device does via AGC
    mu = X.mean(axis=0, keepdims=True)
    sd = X.std(axis=0, keepdims=True) + 1e-6  # epsilon guards constant features
    Xn = (X - mu) / sd

    # All-pairs cosine similarity
    print(f"\nComputing all-pairs cosine similarity ({N}×{N} = "
          f"{N**2:,} pairs)...")
    norms = np.linalg.norm(Xn, axis=1, keepdims=True) + 1e-9
    Xnorm = Xn / norms
    sim = Xnorm @ Xnorm.T
    np.fill_diagonal(sim, -np.inf)  # exclude self-match

    # For each query, find top-K nearest neighbours and measure how many are
    # within the temporal window
    print(f"\nMeasuring temporal-locality of top-{K} cosine-NN with window ±{W:.0f}s...")
    knn_idx = np.argsort(-sim, axis=1)[:, :K]   # [N, K]
    knn_ts = ts[knn_idx]                         # [N, K]
    delta_t = np.abs(knn_ts - ts[:, None])      # [N, K]
    within = (delta_t <= W).astype(np.float32)   # [N, K]
    per_query_within_frac = within.mean(axis=1) # [N] — fraction of K-NN within window
    overall_within_frac = within.mean()         # scalar

    # Random baseline: for each query, what fraction of all OTHER samples
    # fall within ±W of its timestamp? This is the expected hit rate if the
    # K neighbours were drawn uniformly from the N-1 other samples, so the
    # denominator is N-1 (the query itself is not a candidate).
    pair_within = np.abs(ts[:, None] - ts[None, :]) <= W   # [N, N] bool
    np.fill_diagonal(pair_within, False)
    rand_within = pair_within.sum(axis=1) / (N - 1)        # [N]
    rand_baseline = float(rand_within.mean())

    # Headline numbers
    lift = overall_within_frac / max(rand_baseline, 1e-9)

    print("\n=== R9 RSSI-fingerprint K-NN results ===")
    print(f"  K-NN within ±{W:.0f}s:   {overall_within_frac:.3f}")
    print(f"  Random baseline:        {rand_baseline:.3f}")
    print(f"  Lift over random:       {lift:.2f}×")
    print(f"  Per-query stdev:        {per_query_within_frac.std():.3f}")

    if lift >= 3.0:
        verdict = "STRONG: RSSI sequences carry stable spatial fingerprints"
    elif lift >= 1.5:
        verdict = "MODERATE: RSSI fingerprints work but with significant noise"
    else:
        verdict = "WEAK: RSSI-only fingerprint localisation is unreliable on this data"
    print(f"\n  Verdict: {verdict}")

    out = {
        "n_samples": int(N),
        "k": K,  # the K actually used (may be lower than --k on tiny inputs)
        "temporal_window_s": W,
        "knn_within_window_fraction": float(overall_within_frac),
        "random_baseline": rand_baseline,
        "lift": float(lift),
        "per_query_within_fraction_stdev": float(per_query_within_frac.std()),
        "verdict": verdict,
    }
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(out, indent=2), encoding="utf-8")
    print(f"\nWrote {args.out}")


# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file I/O.
if __name__ == "__main__":
    main()