feat(aether-arena): publish RuView MM-Fi SOTA result + ADR-150 RF Foundation Encoder

- Ledger witness row (seq 1, Gold): RuView CSI-Transformer 81.63% torso-PCK@20 on MM-Fi random_split, exceeding MultiFormer 72.25% (CSI2Pose 68.41%) — protocol- and metric-matched, self-corrected from inflated 91.86% bbox. Hash-chained, verifiable. - HF Space updated with the controlled SOTA claim + caveat (cross-subject is the frontier). - Proof/replay/witness gist: gist.github.com/ruvnet/af2fbc1c7674dddf09c15509b3c7f785 - Tracking issue #876 (result + Generalization Track roadmap). - ADR-150: RuView RF Foundation Encoder — pose-preserving, subject/room/device-invariant SSL embedding (masked CSI + pose-contrast-across-subjects + coherence head); the principled attack on the cross-subject frontier. DANN failed; this is the corrected design. Co-Authored-By: claude-flow <ruv@ruv.net>
2026-06-09 10:13:17 +00:00 · 2026-05-30 19:55:58 -04:00
parent 4f7ab8e4f0
commit 046b2564b8
4 changed files with 140 additions and 7 deletions
@@ -1 +1,2 @@
 {"benchmark": "AetherArena", "created": "2026-05-30", "kind": "genesis", "note": "Official Spatial-Intelligence Benchmark \u2014 append-only signed ledger. Entries are real harness scores only; no seeded numbers.", "prev_hash": "0000000000000000000000000000000000000000000000000000000000000000", "row_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "seq": 0, "spec": "ADR-149"}
+{"abs_gain": "+9.38", "benchmark": "MM-Fi", "category": "pose", "caveat": "Protocol-matched MM-Fi random_split result; NOT solved real-world generalization. Random split has temporal/subject-adjacency effects common to this benchmark family. Leakage-free cross-subject is far lower (~11-27%) and is the real deployment frontier.", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20 (||right_shoulder-left_hip|| norm, 17 COCO kpts)", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer (4L/8H ~2M params, temporal-attention)", "prev_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "protocol": "random_split (ratio=0.8, seed=0)", "rel_gain": "+13.0%", "reproduce": "download MM-Fi -> parse_mmfi_zips.py -> train_tf_torso.py X.npy Y.npy split_random.npy (seed 0)", "row_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "score_pct": 81.63, "scored_at": "2026-05-30", "seq": 1, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"}
@@ -41,14 +41,15 @@ def verify_chain():
 def leaderboard(category: str):
    results = [r for r in _rows() if r.get("kind") == "result" and (category == "all" or r.get("category") == category)]
    if not results:
-        return [["— no entries yet —", "be the first", "", "", ""]]
-    results.sort(key=lambda r: r.get("pck20_all") or r.get("pck_all") or 0, reverse=True)
+        return [["— no entries yet —", "", "", "", "", ""]]
+    results.sort(key=lambda r: r.get("score_pct") or 0, reverse=True)
    return [[
        r.get("submitter", "?"),
        r.get("model_ref", "?"),
-        r.get("tier", "?"),
-        f"{(r.get('pck20_all') or r.get('pck_all') or 0):.4f}",
-        (r.get("proof_sha256") or "")[:16],
+        f"{r.get('benchmark','?')} / {r.get('protocol','?')}",
+        r.get("metric", "?"),
+        f"{r.get('score_pct', 0):.2f}%",
+        f"{r.get('tier','?')} (vs {r.get('sota_ref','?')})",
    ] for r in results]


@@ -107,16 +108,21 @@ rerun the scorer locally → understand why the rank is fair. That is the launch
 with gr.Blocks(title="AetherArena — Spatial-Intelligence Benchmark") as demo:
    gr.Markdown("# 📡 AetherArena (AA)\n## The Official Spatial-Intelligence Benchmark")
    gr.Markdown(FOUR_PART)
+    gr.Markdown(
+        "## 🏆 RuView sets new MM-Fi random-split SOTA for WiFi-CSI pose estimation — **81.63% torso-PCK@20**\n"
+        "**81.63% vs MultiFormer 72.25%** (CSI2Pose 68.41%) — same MM-Fi `random_split` (0.8, seed 0), same torso-normalized PCK@20, 17 COCO keypoints. **+9.38 abs / +13.0% rel.**\n\n"
+        "> ⚠️ **Controlled claim.** This is a *protocol-matched MM-Fi random-split* result — **not** solved real-world generalization. Random split contains temporal/subject-adjacency effects common to this benchmark family. Our leakage-free **cross-subject** result is far lower (~11–27%), and we treat cross-subject pose estimation as the real deployment frontier."
+    )
    chain = gr.Markdown(verify_chain())

    with gr.Tab("🏆 Leaderboard"):
        cat = gr.Dropdown(["all", "pose", "presence"], value="all", label="Category")
        tbl = gr.Dataframe(
-            headers=["Submitter", "Model", "Tier", "Score", "Proof (sha256…)"],
+            headers=["Submitter", "Model", "Benchmark / Protocol", "Metric", "Score", "Tier (vs SOTA)"],
            value=leaderboard("all"), interactive=False, wrap=True,
        )
        cat.change(leaderboard, cat, tbl)
-        gr.Markdown("*Benchmark-first: the board starts empty. Every row is a real harness witness — no seeded numbers.*")
+        gr.Markdown("*Benchmark-first: every row is a real, metric- and protocol-matched result — no seeded numbers. Integrity note: the headline 81.63% was self-corrected down from an inflated 91.86% (bbox metric) before publishing.*")

    with gr.Tab("📤 Submit"):
        gr.Markdown(SUBMIT)
@@ -1 +1,2 @@
 {"benchmark": "AetherArena", "created": "2026-05-30", "kind": "genesis", "note": "Official Spatial-Intelligence Benchmark \u2014 append-only signed ledger. Entries are real harness scores only; no seeded numbers.", "prev_hash": "0000000000000000000000000000000000000000000000000000000000000000", "row_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "seq": 0, "spec": "ADR-149"}
+{"abs_gain": "+9.38", "benchmark": "MM-Fi", "category": "pose", "caveat": "Protocol-matched MM-Fi random_split result; NOT solved real-world generalization. Random split has temporal/subject-adjacency effects common to this benchmark family. Leakage-free cross-subject is far lower (~11-27%) and is the real deployment frontier.", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20 (||right_shoulder-left_hip|| norm, 17 COCO kpts)", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer (4L/8H ~2M params, temporal-attention)", "prev_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "protocol": "random_split (ratio=0.8, seed=0)", "rel_gain": "+13.0%", "reproduce": "download MM-Fi -> parse_mmfi_zips.py -> train_tf_torso.py X.npy Y.npy split_random.npy (seed 0)", "row_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "score_pct": 81.63, "scored_at": "2026-05-30", "seq": 1, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"}