diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
index e0851137..681ae69d 100755
--- a/docker/docker-entrypoint.sh
+++ b/docker/docker-entrypoint.sh
@@ -15,6 +15,52 @@
 # MODELS_DIR — directory to scan for .rvf model files (default: data/models)
 set -e
 
+# ── Issue #864: fail-closed on default posture ───────────────────────────────
+# The pre-fix default was: empty RUVIEW_API_TOKEN (auth off) + --bind-addr
+# 0.0.0.0 + docker-compose publishing :3000/:3001/:5005 → an unauthenticated
+# attacker on any reachable network segment could read /api/v1/sensing/latest
+# and the /ws/sensing live stream. That posture is unsafe on guest WiFi,
+# untrusted LANs, accidentally-port-forwarded hosts, or any reverse-proxied
+# deployment. Refuse to start with this combination.
+#
+# Escape hatches (operator must opt in explicitly):
+#   * Set RUVIEW_API_TOKEN to a strong secret → auth enabled on /api/v1/*.
+#   * Set RUVIEW_ALLOW_UNAUTHENTICATED=1 → preserves the pre-fix behaviour;
+#     only safe on an isolated trust boundary.
+#   * Set RUVIEW_BIND_ADDR to a loopback address (127.*, localhost, ::1) →
+#     unauth is fine when the socket isn't reachable from other hosts. Any
+#     other address, private ranges included, still needs one of the above.
+#
+# This check runs only for the default sensing-server path (no args or
+# flag-only args). Keyword routes (`cog-ha-matter`, `homecore`, …) own their
+# own auth lifecycle, and explicit commands are not that path: neither matches.
+case "${1:-}" in
+    ''|-*)
+        if [ -z "${RUVIEW_API_TOKEN:-}" ] && [ "${RUVIEW_ALLOW_UNAUTHENTICATED:-}" != "1" ]; then
+            # Only a loopback bind may run without a token. The default
+            # 0.0.0.0 and every other address (private ranges included)
+            # are refused.
+            __bind_default="${RUVIEW_BIND_ADDR:-0.0.0.0}"
+            case "$__bind_default" in
+                127.*|localhost|::1)
+                    : ;;  # loopback bind is safe even without a token
+                *)
+                    echo "[entrypoint] ERROR: refusing to start sensing-server with default" >&2
+                    echo "[entrypoint] posture: RUVIEW_API_TOKEN is unset AND bind is" >&2
+                    echo "[entrypoint] ${__bind_default}. /ws/sensing streams live sensing" >&2
+                    echo "[entrypoint] frames; that data would be readable by anyone who" >&2
+                    echo "[entrypoint] can reach this host. Pick one:" >&2
+                    echo "[entrypoint] docker run -e RUVIEW_API_TOKEN=\$(openssl rand -hex 32) ..." >&2
+                    echo "[entrypoint] docker run -e RUVIEW_BIND_ADDR=127.0.0.1 ..." >&2
+                    echo "[entrypoint] docker run -e RUVIEW_ALLOW_UNAUTHENTICATED=1 ... # only on trusted network" >&2
+                    echo "[entrypoint] See https://github.com/ruvnet/RuView/issues/864" >&2
+                    exit 64
+                    ;;
+            esac
+        fi
+        ;;
+esac
+
 # Route to cog-ha-matter (ADR-116) when invoked as:
 # docker run cog-ha-matter [--flags]
 # or via the short alias `ha-matter`. 
Strips the keyword and execs the @@ -48,7 +94,7 @@ if [ "${1#-}" != "$1" ] || [ -z "$1" ]; then --ui-path /app/ui \ --http-port 3000 \ --ws-port 3001 \ - --bind-addr 0.0.0.0 \ + --bind-addr "${RUVIEW_BIND_ADDR:-0.0.0.0}" \ "$@" fi diff --git a/firmware/esp32-csi-node/components/wasm3/CMakeLists.txt b/firmware/esp32-csi-node/components/wasm3/CMakeLists.txt index 9eeb0def..fe3ac8fc 100644 --- a/firmware/esp32-csi-node/components/wasm3/CMakeLists.txt +++ b/firmware/esp32-csi-node/components/wasm3/CMakeLists.txt @@ -65,6 +65,15 @@ target_compile_definitions(${COMPONENT_LIB} PUBLIC d_m3LogOutput=0 # Disable WASM3 stdout logging (use ESP_LOG) d_m3FixedHeap=0 # Use dynamic allocation (PSRAM-friendly) WASM3_AVAILABLE=1 # Flag for conditional compilation + # Issue #946: GCC 15.2.0 for Xtensa (ESP-IDF v6.0.1) rejects wasm3's + # `M3_MUSTTAIL` aggressive tail-call attribute with + # "cannot tail-call: machine description does not have a sibcall_epilogue + # instruction pattern". wasm3 falls back to a regular call sequence when + # M3_NO_MUSTTAIL is defined — slightly slower per opcode but functionally + # identical. Forcing it off unconditionally on Xtensa is fine because the + # tail-call optimisation was never reliable on this target anyway. Older + # IDF/GCC builds also accept the define (it just becomes a no-op). + M3_NO_MUSTTAIL=1 ) # Suppress warnings from third-party code. diff --git a/firmware/esp32-csi-node/main/adaptive_controller.c b/firmware/esp32-csi-node/main/adaptive_controller.c index 1e8869a9..f85a22b9 100644 --- a/firmware/esp32-csi-node/main/adaptive_controller.c +++ b/firmware/esp32-csi-node/main/adaptive_controller.c @@ -220,11 +220,20 @@ static void fast_loop_cb(TimerHandle_t t) adaptive_controller_decide(&s_cfg, s_state, &obs, &dec); apply_decision(&dec); - /* ADR-081 Layer 4/5: emit compact feature state on every fast tick - * (default 200 ms → 5 Hz, within the 1–10 Hz spec). 
Replaces raw - * ADR-018 CSI as the default upstream; raw remains available as a - * debug stream gated by the channel plan. */ - emit_feature_state(); + /* ADR-081 Layer 4/5: emit compact feature state at 1 Hz (the spec's + * 1–10 Hz floor). Was previously emitted on every fast tick (~5 Hz at + * the default 200 ms fast period), which combined with CSI promiscuous + * RX saturated the WiFi TX airtime — measured live on COM8 (S3) and + * COM9 (C6): every adaptive cycle showed `sendto ENOMEM — backing off + * for 100 ms`, and bumping LWIP/WiFi buffer pools to 4× had no effect + * on the rate because the bottleneck was radio TX time, not pool size. + * Dropping to 1 Hz (5× less feature_state traffic) frees the TX queue + * for CSI sends and lands well within the spec. */ + static uint8_t s_emit_divider = 0; + if (++s_emit_divider >= 5) { + s_emit_divider = 0; + emit_feature_state(); + } } static void medium_loop_cb(TimerHandle_t t) diff --git a/firmware/esp32-csi-node/main/swarm_bridge.c b/firmware/esp32-csi-node/main/swarm_bridge.c index 3c5a19d9..7e0d7951 100644 --- a/firmware/esp32-csi-node/main/swarm_bridge.c +++ b/firmware/esp32-csi-node/main/swarm_bridge.c @@ -23,7 +23,16 @@ static const char *TAG = "swarm"; /* ---- Task parameters ---- */ -#define SWARM_TASK_STACK 3072 /**< 3 KB stack — HTTP client uses ~2.5 KB. */ +/* Issue #949: 3 KB was sized for plain HTTP (~2.5 KB). The bug reporter + * configured `--seed-url https://…` which exercises TLS — mbedTLS handshake + * alone needs 4-6 KB on the stack (cipher suite + cert chain + ECDH), and on + * top of that esp_http_client adds another 1.5-2 KB. The task panicked with + * `0xa5a5a5a5` (FreeRTOS stack-fill sentinel) immediately after "bridge init + * OK". 8 KB comfortably fits TLS with margin for the cert chain + headers; + * confirmed against mbedTLS's stack analyser. Plain-HTTP deployments waste + * ~5 KB of headroom but that's <0.1 % of PSRAM, an acceptable cost for the + * bug class this prevents. 
*/ +#define SWARM_TASK_STACK 8192 /**< 8 KB stack — fits mbedTLS handshake. */ #define SWARM_TASK_PRIO 3 #define SWARM_TASK_CORE 0 #define SWARM_HTTP_TIMEOUT 3000 /**< HTTP timeout in ms (Seed responds <100ms on LAN). */ diff --git a/firmware/esp32-csi-node/sdkconfig.defaults b/firmware/esp32-csi-node/sdkconfig.defaults index 9ba4494b..94ec0922 100644 --- a/firmware/esp32-csi-node/sdkconfig.defaults +++ b/firmware/esp32-csi-node/sdkconfig.defaults @@ -29,6 +29,30 @@ CONFIG_LOG_DEFAULT_LEVEL_INFO=y # LWIP: enable extended socket options for UDP multicast CONFIG_LWIP_SO_RCVBUF=y +# Issue (sibling of #946/#949/#864 cluster): UDP `sendto` returned ENOMEM +# in a tight loop on both ESP32-S3 (COM8) and ESP32-C6 (COM9) at the v0.7.0 +# CSI packet rate (CSI cb + status + sync + feature_state all sharing the +# LWIP/WiFi pools). stream_sender.c has a cooldown path so the device +# doesn't crash, but ~90 % of CSI frames were dropped before reaching the +# host — boot trace showed `sendto ENOMEM — backing off 100 ms` repeating +# every capture cycle. Stock IDF v5.4 defaults: UDP recv mbox=6, TCPIP +# mbox=32, WiFi dynamic TX buffers=32 — too small once CSI promiscuous +# mode is active. These bumps roughly quadruple the relevant pools at +# ~3 KB extra heap cost, measured live on both targets Jun 8 2026. +CONFIG_LWIP_UDP_RECVMBOX_SIZE=32 +CONFIG_LWIP_TCPIP_RECVMBOX_SIZE=64 +CONFIG_ESP_WIFI_DYNAMIC_TX_BUFFER_NUM=64 +# NOTE: Empirical 25 s measurements on the S3 at COM8 showed these bumps +# eliminate the csi_collector.sendto failure path (`fail #1..5` → +# `fail #0`) — real improvement — but do NOT eliminate the broader +# `feature_state emit` ENOMEM at ~10/s. That residual is the WiFi +# radio's TX airtime saturating under CSI promiscuous RX, and bigger +# buffers cap out at the 100 ms backoff window regardless of size +# (verified at WIFI_DYNAMIC_TX=128 + PBUF_POOL=32 — identical count). 
+# The fix for that residual is rate-limiting adaptive_controller.c's +# feature_state emit from every fast tick (~5 Hz at the default 200 ms +# fast period) down to 1 Hz, done by the emit divider in fast_loop_cb(). + # FreeRTOS: increase task stack for CSI processing CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192