From 575ee4d2eb8d2e501fa0ba96405ee75e2062032c Mon Sep 17 00:00:00 2001 From: ruv Date: Wed, 10 Jun 2026 23:04:38 -0400 Subject: [PATCH] feat(benchmarks): static PTQ int8 (calibrated) results + overnight capture script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Conv-only static QDQ beats dynamic int8 on accuracy (PCK@20 96.61-96.63% vs 96.52%, MPJPE +10% vs +18% over fp32) at ~equal size/latency; all-ops QDQ strictly worse (int8 activations through attention glue). Entropy calibration verified bit-identical to MinMax on this data. Deployment: ONNX fp32 for speed (3.2ms), static conv-only QDQ for smallest (2.53MB). Also: scripts/overnight-empty-capture.py — segmented UDP CSI recorder for empty-room baselines (no glob collisions, detach-safe). Co-Authored-By: claude-flow --- benchmarks/wiflow-std/RESULTS.md | 57 +++ .../wiflow-std/results/edge_optimization.json | 391 ++++++++++++++++++ benchmarks/wiflow-std/static_ptq_bench.py | 332 +++++++++++++++ scripts/overnight-empty-capture.py | 80 ++++ 4 files changed, 860 insertions(+) create mode 100644 benchmarks/wiflow-std/static_ptq_bench.py create mode 100644 scripts/overnight-empty-capture.py diff --git a/benchmarks/wiflow-std/RESULTS.md b/benchmarks/wiflow-std/RESULTS.md index 48f38933..a1906c53 100644 --- a/benchmarks/wiflow-std/RESULTS.md +++ b/benchmarks/wiflow-std/RESULTS.md @@ -176,6 +176,63 @@ a model blocker). Parity vs torch on the stored fixture (`results/parity_fixture.npz`, batch 2, seed 42): **max abs diff 2.4e-7 — PASS** (< 1e-4). ORT-quantized int8 model: `results/retrained_int8_ort_dynamic.onnx`. 
+### Static PTQ (calibrated) — follow-up + +Follow-up to the dynamic-int8 row above (2026-06-10, same box, onnxruntime +1.26.0): ONNX Runtime **static** post-training quantization +(`quantize_static`, QDQ format, per-channel int8 weights + int8 activations) +of the same fp32 export, calibrated on **corruption-free TRAINING-split +windows only** (seed-42 file-level split, same masks; 1,000 windows for +MinMax, 512 for the histogram calibrators; never test windows). Scopes: +"conv-only" (`op_types_to_quantize=["Conv"]`) and "all-ops" (ORT default op set). The attention path exports as +Einsum/Softmax, which ORT never quantizes anyway, so "all-ops" additionally +quantizes the elementwise Mul/Sigmoid/Add/AveragePool glue. Accuracy on the +identical 10k-window seed-42 corruption-free test subset; latency median of +3 interleaved reps (fp32/dynamic re-benched in-session as references). +Script: `static_ptq_bench.py`; raw: `results/edge_optimization.json` +(`onnx_static_ptq`). + +| Variant | Disk size | Batch 1 (ms/win) | Batch 64 (ms/win) | PCK@20 | PCK@50 | MPJPE | +|---|---|---|---|---|---|---| +| ONNX fp32 (reference) | 8.97 MB | 2.5 | 1.9 | 96.68% | 99.15% | 0.00936 | +| ORT dynamic int8 (baseline) | **2.44 MB** | 5.7 | 4.6 | 96.52% | 99.15% | 0.01108 | +| static QDQ **Percentile(99.99) conv-only** | 2.53 MB | 5.3 | 4.7 | 96.61% | 99.16% | **0.01031** | +| static QDQ MinMax conv-only | 2.53 MB | 5.2 | 3.3 | **96.63%** | 99.19% | 0.01084 | +| static QDQ Entropy conv-only | 2.53 MB | 5.2 | 3.1 | 96.60% | 99.19% | 0.01078 | +| static QDQ MinMax all-ops | 2.60 MB | 6.5 | 3.9 | 95.45% | 99.14% | 0.01486 | +| static QDQ Entropy all-ops | 2.60 MB | 5.7 | 4.1 | 95.30% | 99.13% | 0.01510 | +| static QDQ Percentile all-ops | 2.60 MB | 5.3 | 4.3 | 96.39% | 99.17% | 0.01218 | + +**Verdict: static PTQ (conv-only) is the new best int8 point on accuracy — +but only modestly, and it does not fix int8's latency penalty.** + +- **Accuracy: beats dynamic.** All three conv-only calibrations land at + PCK@20 96.60–96.63% 
(vs dynamic 96.52%, fp32 96.68% — recovers ~⅔ of the + dynamic gap) and MPJPE 0.0103–0.0108 (vs dynamic 0.01108). Best MPJPE: + Percentile conv-only, +10% over fp32 instead of dynamic's +18%. +- **Size: slightly worse.** 2.53 MB vs 2.44 MB (+3.6%) — QDQ nodes and + per-channel scales cost a little; BatchNorm stays fp32 in both (the 12 BNs + follow Slice/Einsum/Reshape, never Conv, so they cannot be folded). +- **Latency: a wash vs dynamic, still ~2× slower than ONNX fp32 at batch 1.** + Batch-1 medians 5.2–5.3 vs dynamic 5.7 ms/win in-session — within this + box's ±20–40% noise. Batch 64 leans static (3.1–3.3 for MinMax/Entropy + conv-only vs 4.6), same caveat. +- **All-ops QDQ is strictly worse**: up to −1.4 pt PCK@20 and +60% MPJPE vs fp32, for + zero size/latency benefit over conv-only — pushing int8 activations through the elementwise glue + around the attention blocks is where the damage is. Conv-only is the right + scope. +- Negative result worth recording: **Entropy calibration is a no-op here** — + on an identical calibration set it selects full-range thresholds + bit-identical to MinMax (all 247 scales equal; verified on a 64-window + smoke set); the MinMax and Entropy rows in the table differ because they were calibrated on different sets (1,000 vs 512 windows). Also, ORT 1.26's `CalibMaxIntermediateOutputs` raises a + spurious "No data is collected" when the batch count is a multiple of the chunk + size (worked around in the script by not passing the option). + +Deployment guidance: need speed → ONNX fp32 (3.2 ms b1; 2.5 ms in this session's re-bench). Need int8 weights +for size → static QDQ conv-only (Percentile or MinMax, +`results/retrained_int8_static_percentile_conv.onnx`), which strictly +dominates dynamic int8 on accuracy at ~equal latency and +0.09 MB. 
+ ## Measurement (b): BLOCKED-ON-DATA (attempted 2026-06-10) The fine-tune-on-ESP32 measurement stopped at dataset characterization, per the diff --git a/benchmarks/wiflow-std/results/edge_optimization.json b/benchmarks/wiflow-std/results/edge_optimization.json index 257d6ad5..ba07eac5 100644 --- a/benchmarks/wiflow-std/results/edge_optimization.json +++ b/benchmarks/wiflow-std/results/edge_optimization.json @@ -235,5 +235,396 @@ 7.8067296875019565 ] } + }, + "onnx_static_ptq": { + "env": { + "onnxruntime": "1.26.0", + "torch": "2.12.0+cpu", + "platform": "Windows-11-10.0.26200-SP0", + "source_model": "retrained_fp32_dynamic.onnx", + "preprocessed_model": { + "file": "retrained_fp32_preproc.onnx", + "size_mb": 8.981529 + } + }, + "variants": { + "minmax_all": { + "file": "retrained_int8_static_minmax_all.onnx", + "size_bytes": 2604286, + "size_mb": 2.604286, + "calibration": { + "method": "minmax", + "windows": 1000, + "percentile": null, + "seconds": 5.052440166473389 + }, + "scope": "all", + "per_channel": true, + "activation_type": "QInt8", + "weight_type": "QInt8", + "node_counts": { + "Add": 9, + "AveragePool": 1, + "BatchNormalization": 12, + "Concat": 10, + "Conv": 43, + "DequantizeLinear": 283, + "Einsum": 4, + "Gather": 16, + "Mul": 39, + "QuantizeLinear": 181, + "Reshape": 14, + "Shape": 2, + "Sigmoid": 37, + "Slice": 8, + "Softmax": 2, + "Squeeze": 1, + "Transpose": 7, + "Unsqueeze": 11 + }, + "max_abs_diff_vs_fp32_fixture": 0.015945255756378174, + "accuracy": { + "samples": 10000, + "pck@20": 0.9545266661643982, + "pck@50": 0.9913666645050049, + "mpjpe": 0.014860070134699345, + "wall_seconds": 43.455235958099365 + } + }, + "minmax_conv": { + "file": "retrained_int8_static_minmax_conv.onnx", + "size_bytes": 2527421, + "size_mb": 2.527421, + "calibration": { + "method": "minmax", + "windows": 1000, + "percentile": null, + "seconds": 4.380746126174927 + }, + "scope": "conv", + "per_channel": true, + "activation_type": "QInt8", + "weight_type": "QInt8", + 
"node_counts": { + "Add": 9, + "AveragePool": 1, + "BatchNormalization": 12, + "Concat": 10, + "Conv": 43, + "DequantizeLinear": 156, + "Einsum": 4, + "Gather": 16, + "Mul": 39, + "QuantizeLinear": 78, + "Reshape": 14, + "Shape": 2, + "Sigmoid": 37, + "Slice": 8, + "Softmax": 2, + "Squeeze": 1, + "Transpose": 7, + "Unsqueeze": 11 + }, + "max_abs_diff_vs_fp32_fixture": 0.010693132877349854, + "accuracy": { + "samples": 10000, + "pck@20": 0.9663399996757507, + "pck@50": 0.9918666641235352, + "mpjpe": 0.01084446222037077, + "wall_seconds": 35.937947034835815 + } + }, + "entropy_all": { + "file": "retrained_int8_static_entropy_all.onnx", + "size_bytes": 2604268, + "size_mb": 2.604268, + "calibration": { + "method": "entropy", + "windows": 512, + "percentile": null, + "seconds": 23.835066318511963 + }, + "scope": "all", + "per_channel": true, + "activation_type": "QInt8", + "weight_type": "QInt8", + "node_counts": { + "Add": 9, + "AveragePool": 1, + "BatchNormalization": 12, + "Concat": 10, + "Conv": 43, + "DequantizeLinear": 283, + "Einsum": 4, + "Gather": 16, + "Mul": 39, + "QuantizeLinear": 181, + "Reshape": 14, + "Shape": 2, + "Sigmoid": 37, + "Slice": 8, + "Softmax": 2, + "Squeeze": 1, + "Transpose": 7, + "Unsqueeze": 11 + }, + "max_abs_diff_vs_fp32_fixture": 0.015280365943908691, + "accuracy": { + "samples": 10000, + "pck@20": 0.9530466662406921, + "pck@50": 0.9912600006103516, + "mpjpe": 0.015098519864678382, + "wall_seconds": 51.514281034469604 + } + }, + "entropy_conv": { + "file": "retrained_int8_static_entropy_conv.onnx", + "size_bytes": 2527403, + "size_mb": 2.527403, + "calibration": { + "method": "entropy", + "windows": 512, + "percentile": null, + "seconds": 9.634419918060303 + }, + "scope": "conv", + "per_channel": true, + "activation_type": "QInt8", + "weight_type": "QInt8", + "node_counts": { + "Add": 9, + "AveragePool": 1, + "BatchNormalization": 12, + "Concat": 10, + "Conv": 43, + "DequantizeLinear": 156, + "Einsum": 4, + "Gather": 16, + "Mul": 39, + 
"QuantizeLinear": 78, + "Reshape": 14, + "Shape": 2, + "Sigmoid": 37, + "Slice": 8, + "Softmax": 2, + "Squeeze": 1, + "Transpose": 7, + "Unsqueeze": 11 + }, + "max_abs_diff_vs_fp32_fixture": 0.012535125017166138, + "accuracy": { + "samples": 10000, + "pck@20": 0.9659599989891052, + "pck@50": 0.9918666648864746, + "mpjpe": 0.010778637571632861, + "wall_seconds": 41.01180171966553 + } + }, + "percentile_all": { + "file": "retrained_int8_static_percentile_all.onnx", + "size_bytes": 2604052, + "size_mb": 2.604052, + "calibration": { + "method": "percentile", + "windows": 512, + "percentile": 99.99, + "seconds": 20.221954584121704 + }, + "scope": "all", + "per_channel": true, + "activation_type": "QInt8", + "weight_type": "QInt8", + "node_counts": { + "Add": 9, + "AveragePool": 1, + "BatchNormalization": 12, + "Concat": 10, + "Conv": 43, + "DequantizeLinear": 283, + "Einsum": 4, + "Gather": 16, + "Mul": 39, + "QuantizeLinear": 181, + "Reshape": 14, + "Shape": 2, + "Sigmoid": 37, + "Slice": 8, + "Softmax": 2, + "Squeeze": 1, + "Transpose": 7, + "Unsqueeze": 11 + }, + "max_abs_diff_vs_fp32_fixture": 0.017689883708953857, + "accuracy": { + "samples": 10000, + "pck@20": 0.9639333323478698, + "pck@50": 0.9916799991607667, + "mpjpe": 0.012176512064039708, + "wall_seconds": 49.365190744400024 + } + }, + "percentile_conv": { + "file": "retrained_int8_static_percentile_conv.onnx", + "size_bytes": 2527241, + "size_mb": 2.527241, + "calibration": { + "method": "percentile", + "windows": 512, + "percentile": 99.99, + "seconds": 8.223475694656372 + }, + "scope": "conv", + "per_channel": true, + "activation_type": "QInt8", + "weight_type": "QInt8", + "node_counts": { + "Add": 9, + "AveragePool": 1, + "BatchNormalization": 12, + "Concat": 10, + "Conv": 43, + "DequantizeLinear": 156, + "Einsum": 4, + "Gather": 16, + "Mul": 39, + "QuantizeLinear": 78, + "Reshape": 14, + "Shape": 2, + "Sigmoid": 37, + "Slice": 8, + "Softmax": 2, + "Squeeze": 1, + "Transpose": 7, + "Unsqueeze": 11 + }, + 
"max_abs_diff_vs_fp32_fixture": 0.014725983142852783, + "accuracy": { + "samples": 10000, + "pck@20": 0.9660599988937378, + "pck@50": 0.9916066654205322, + "mpjpe": 0.010310938355326652, + "wall_seconds": 36.89548587799072 + } + } + }, + "latency": { + "note": "3 interleaved repetitions per variant, median ms/window; onnx_fp32 / onnx_int8_ort_dynamic are same-session references", + "onnx_fp32": { + "batch1_reps": [ + 4.5327999996516155, + 2.535649999117595, + 2.167549997466267 + ], + "batch64_reps": [ + 1.9354515624740998, + 2.4948054687854437, + 1.9334703125082342 + ], + "batch1_ms_per_window_median": 2.535649999117595, + "batch64_ms_per_window_median": 1.9354515624740998 + }, + "onnx_int8_ort_dynamic": { + "batch1_reps": [ + 5.698599999959697, + 5.721350000385428, + 4.805099997611251 + ], + "batch64_reps": [ + 4.096601562508795, + 4.857628124995017, + 4.583800000006022 + ], + "batch1_ms_per_window_median": 5.698599999959697, + "batch64_ms_per_window_median": 4.583800000006022 + }, + "entropy_all": { + "batch1_reps": [ + 6.444149999879301, + 5.038299999796436, + 5.713200000172947 + ], + "batch64_reps": [ + 4.149468750028973, + 3.437125000004926, + 4.410960937491382 + ], + "batch1_ms_per_window_median": 5.713200000172947, + "batch64_ms_per_window_median": 4.149468750028973 + }, + "entropy_conv": { + "batch1_reps": [ + 4.874750000453787, + 5.169099998965976, + 5.236699998931726 + ], + "batch64_reps": [ + 3.010160156236452, + 3.1175546875203963, + 3.516850781238645 + ], + "batch1_ms_per_window_median": 5.169099998965976, + "batch64_ms_per_window_median": 3.1175546875203963 + }, + "percentile_all": { + "batch1_reps": [ + 5.184749999898486, + 5.2898499998264015, + 5.916899999647285 + ], + "batch64_reps": [ + 4.305105468745296, + 4.460741406262514, + 4.184502343747454 + ], + "batch1_ms_per_window_median": 5.2898499998264015, + "batch64_ms_per_window_median": 4.305105468745296 + }, + "percentile_conv": { + "batch1_reps": [ + 4.916449999655015, + 7.150899999032845, + 
5.284949998895172 + ], + "batch64_reps": [ + 3.855813281262499, + 4.688969531230214, + 5.220103124997877 + ], + "batch1_ms_per_window_median": 5.284949998895172, + "batch64_ms_per_window_median": 4.688969531230214 + }, + "minmax_all": { + "batch1_reps": [ + 6.463300000177696, + 7.149449998905766, + 5.3209000016067876 + ], + "batch64_reps": [ + 3.9251343750095202, + 4.033442187505898, + 3.428199218745931 + ], + "batch1_ms_per_window_median": 6.463300000177696, + "batch64_ms_per_window_median": 3.9251343750095202 + }, + "minmax_conv": { + "batch1_reps": [ + 5.9961499991914025, + 5.236549999608542, + 4.854399998293957 + ], + "batch64_reps": [ + 4.368359375007458, + 3.249617187492504, + 3.0238906249735464 + ], + "batch1_ms_per_window_median": 5.236549999608542, + "batch64_ms_per_window_median": 3.249617187492504 + } + }, + "accuracy_subset": { + "description": "seed-42 file-level 70/15/15 test split, corrupted windows excluded, seed-42 random subset (same as quantize_bench/eval_ort_accuracy)", + "subset_size": 10000 + } } } \ No newline at end of file diff --git a/benchmarks/wiflow-std/static_ptq_bench.py b/benchmarks/wiflow-std/static_ptq_bench.py new file mode 100644 index 00000000..1f74eeb6 --- /dev/null +++ b/benchmarks/wiflow-std/static_ptq_bench.py @@ -0,0 +1,332 @@ +"""ADR-152 edge optimization follow-up: ONNX Runtime STATIC post-training +quantization (calibration-based QDQ) of the retrained WiFlow-STD model, to +improve on the dynamic-int8 result (2.44 MB, PCK@20 96.52%, 6.5 ms/win b1). + +Static PTQ pre-computes activation ranges from calibration data, so inference +uses QLinearConv/QDQ kernels instead of dynamic ConvInteger -- typically both +faster and (with good calibration) closer to fp32 accuracy. 
+ +Method: + - Calibration set: corruption-free windows drawn ONLY from the seed-42 + file-level TRAINING split (same split as eval_repro.py; corrupted windows + excluded via results/nan_windows_mask.npy | big_windows_mask.npy), chosen + with np.random.default_rng(42). Never test windows. Entropy/Percentile use the first --calib-hist windows of that index-sorted draw, not an independent random subset. + - quantize_static, QuantFormat.QDQ, per-channel int8 weights, int8 + activations; calibration methods MinMax / Entropy / Percentile(99.99); + scopes "all" (ORT default op set) vs "conv" (op_types_to_quantize= + ["Conv"] -- leaves the attention path, which exports as Einsum/Softmax + and elementwise ops, in fp32). + - Model is pre-processed first (quant_pre_process: symbolic shape + inference + ORT graph optimization; this can fold a BatchNormalization into a preceding Conv, but the 12 BatchNormalization nodes of this model remain in every quantized graph -- see node_counts in the results JSON). + - Accuracy: identical protocol to eval_ort_accuracy.py -- the 10,000-window + seed-42 subset of the corruption-free test split (PCK@20/50, MPJPE). + - Latency: median ms/window at batch 1 (100 runs) and batch 64 (30 runs), + 3 interleaved repetitions across all variants (fp32 and dynamic-int8 + sessions included as same-session reference points). + +Usage: + PYTHONUTF8=1 .venv/Scripts/python.exe static_ptq_bench.py \ + [--data-dir DIR] [--subset 10000] + [--calib-minmax 1000] [--calib-hist 512] [--skip-accuracy] [--methods minmax,entropy,percentile] [--out PATH] + +Writes/merges into results/edge_optimization.json under key "onnx_static_ptq". 
+""" + +import argparse +import collections +import json +import os +import platform +import statistics +import sys +import time + +import numpy as np +import torch + +HERE = os.path.dirname(os.path.abspath(__file__)) +RESULTS = os.path.join(HERE, "results") +sys.path.insert(0, HERE) + +# quantize_bench sets up upstream imports + the np.load mmap patch +from quantize_bench import build_test_subset # noqa: E402 +import quantize_bench as qb # noqa: E402 +from eval_ort_accuracy import evaluate_ort # noqa: E402 + +FP32_ONNX = os.path.join(RESULTS, "retrained_fp32_dynamic.onnx") +DYN_INT8_ONNX = os.path.join(RESULTS, "retrained_int8_ort_dynamic.onnx") +PREPROC_ONNX = os.path.join(RESULTS, "retrained_fp32_preproc.onnx") + + +# --------------------------------------------------------------------------- +# calibration data: corruption-free TRAINING-split windows only +# --------------------------------------------------------------------------- + +def build_calibration_windows(data_dir, n_windows): + """Seed-42 file-level 70/15/15 TRAIN split (exactly as eval_repro.py), + minus corrupted windows, then a seed-42 random draw of n_windows.""" + dataset = qb.PreprocessedCSIKeypointsDataset( + data_dir=data_dir, keypoint_scale=1000.0, enable_temporal_clean=True) + train_loader, _va, _te = qb.create_preprocessed_train_val_test_loaders( + dataset=dataset, batch_size=64, num_workers=0, random_seed=42) + train_indices = np.asarray(train_loader.dataset.indices) + + corrupted = (np.load(os.path.join(RESULTS, "nan_windows_mask.npy")) + | np.load(os.path.join(RESULTS, "big_windows_mask.npy"))) + clean = train_indices[~corrupted[train_indices]] + print(f"train split: {len(train_indices)} windows, " + f"{len(train_indices) - len(clean)} corrupted excluded, " + f"{len(clean)} clean") + + rng = np.random.default_rng(42) + sel = np.sort(rng.choice(clean, size=n_windows, replace=False)) + xs = np.stack([dataset[int(i)][0].numpy() for i in sel]).astype(np.float32) + print(f"calibration 
tensor: {xs.shape} from {n_windows} clean TRAIN windows") + return xs + + +def make_reader(windows, batch_size=64): + from onnxruntime.quantization import CalibrationDataReader + + class WindowReader(CalibrationDataReader): + def __init__(self): + self._batches = [windows[i:i + batch_size] + for i in range(0, len(windows), batch_size)] + self._it = iter(self._batches) + + def get_next(self): + b = next(self._it, None) + return None if b is None else {"input": b} + + def rewind(self): + self._it = iter(self._batches) + + def __len__(self): + return len(self._batches) + + return WindowReader() + + +# --------------------------------------------------------------------------- +# quantization variants +# --------------------------------------------------------------------------- + +def preprocess_model(): + from onnxruntime.quantization.shape_inference import quant_pre_process + quant_pre_process(FP32_ONNX, PREPROC_ONNX) + return PREPROC_ONNX + + +def quantize_variant(src, dst, method, scope, calib_windows): + from onnxruntime.quantization import (CalibrationMethod, QuantFormat, + QuantType, quantize_static) + methods = { + "minmax": CalibrationMethod.MinMax, + "entropy": CalibrationMethod.Entropy, + "percentile": CalibrationMethod.Percentile, + } + # NB: do NOT pass CalibMaxIntermediateOutputs -- in ORT 1.26 the MinMax + # calibrater clears its buffer every N batches and then raises + # "No data is collected" if the batch count is divisible by N. 
+ extra = {} + if method == "percentile": + extra["CalibPercentile"] = 99.99 + op_types = ["Conv"] if scope == "conv" else None + + t0 = time.time() + quantize_static( + src, dst, make_reader(calib_windows), + quant_format=QuantFormat.QDQ, + op_types_to_quantize=op_types, + per_channel=True, + activation_type=QuantType.QInt8, + weight_type=QuantType.QInt8, + calibrate_method=methods[method], + extra_options=extra, + ) + secs = time.time() - t0 + + import onnx + ops = collections.Counter(n.op_type for n in onnx.load(dst).graph.node) + return { + "file": os.path.basename(dst), + "size_bytes": os.path.getsize(dst), + "size_mb": os.path.getsize(dst) / 1e6, + "calibration": {"method": method, + "windows": int(len(calib_windows)), + "percentile": extra.get("CalibPercentile"), + "seconds": secs}, + "scope": scope, + "per_channel": True, + "activation_type": "QInt8", + "weight_type": "QInt8", + "node_counts": {k: v for k, v in sorted(ops.items())}, + } + + +# --------------------------------------------------------------------------- +# latency (3 interleaved reps, like the latency_controlled_rerun) +# --------------------------------------------------------------------------- + +def ort_session(path): + import onnxruntime as ort + return ort.InferenceSession(path, providers=["CPUExecutionProvider"]) + + +def bench_ort(sess, batch, n_runs): + rng = np.random.default_rng(123) + x = rng.random((batch, 540, 20), dtype=np.float32) + inp = sess.get_inputs()[0].name + for _ in range(max(5, n_runs // 10)): + sess.run(None, {inp: x}) + times = [] + for _ in range(n_runs): + t0 = time.perf_counter() + sess.run(None, {inp: x}) + times.append(time.perf_counter() - t0) + return statistics.median(times) * 1e3 / batch # ms/window + + +def interleaved_latency(sessions, reps=3, runs_b1=100, runs_b64=30): + lat = {name: {"batch1_reps": [], "batch64_reps": []} for name in sessions} + for rep in range(reps): + for name, sess in sessions.items(): + 
lat[name]["batch1_reps"].append(bench_ort(sess, 1, runs_b1)) + lat[name]["batch64_reps"].append(bench_ort(sess, 64, runs_b64)) + print(f" rep {rep + 1}/{reps} {name}: " + f"b1={lat[name]['batch1_reps'][-1]:.2f} " + f"b64={lat[name]['batch64_reps'][-1]:.3f} ms/win", flush=True) + for name in lat: + lat[name]["batch1_ms_per_window_median"] = statistics.median( + lat[name]["batch1_reps"]) + lat[name]["batch64_ms_per_window_median"] = statistics.median( + lat[name]["batch64_reps"]) + return lat + + +# --------------------------------------------------------------------------- + +def main(): + import onnxruntime + parser = argparse.ArgumentParser() + parser.add_argument("--data-dir", default=os.path.join( + os.path.expanduser("~"), ".cache", "kagglehub", "datasets", "kaka2434", + "wiflow-dataset", "versions", "1", "preprocessed_csi_data")) + parser.add_argument("--subset", type=int, default=10000) + parser.add_argument("--calib-minmax", type=int, default=1000) + parser.add_argument("--calib-hist", type=int, default=512, + help="calibration windows for Entropy/Percentile " + "(histogram calibraters hold all intermediate " + "activations in RAM)") + parser.add_argument("--skip-accuracy", action="store_true") + parser.add_argument("--methods", default="minmax,entropy,percentile", + help="comma list of calibration methods to (re)run; " + "results merge into existing onnx_static_ptq") + parser.add_argument("--out", default=os.path.join(RESULTS, "edge_optimization.json")) + args = parser.parse_args() + + results = { + "env": { + "onnxruntime": onnxruntime.__version__, + "torch": torch.__version__, + "platform": platform.platform(), + "source_model": os.path.basename(FP32_ONNX), + }, + "variants": {}, + } + + # ---- calibration data (TRAIN split only) ------------------------------- + calib_mm = build_calibration_windows(args.data_dir, args.calib_minmax) + calib_hist = calib_mm[:args.calib_hist] + + # ---- preprocess + quantize --------------------------------------------- + 
print("\n=== quant_pre_process (shape inference + graph optimization) ===") + src = preprocess_model() + results["env"]["preprocessed_model"] = { + "file": os.path.basename(src), + "size_mb": os.path.getsize(src) / 1e6, + } + + matrix = [(m, s) for m in args.methods.split(",") + for s in ("all", "conv")] + for method, scope in matrix: + name = f"{method}_{scope}" + dst = os.path.join(RESULTS, f"retrained_int8_static_{name}.onnx") + calib = calib_mm if method == "minmax" else calib_hist + print(f"\n=== quantize_static: {name} " + f"({len(calib)} calib windows) ===", flush=True) + try: + results["variants"][name] = quantize_variant( + src, dst, method, scope, calib) + print(f" {results['variants'][name]['size_mb']:.3f} MB") + except Exception as e: # noqa: BLE001 + results["variants"][name] = {"error": f"{type(e).__name__}: {e}"} + print(f" FAILED: {e}") + + # ---- fixture parity (sanity, batch 2) ---------------------------------- + fixture = np.load(os.path.join(RESULTS, "parity_fixture.npz")) + fx, fy = fixture["input"], fixture["output"] + sessions = {} + for name, info in results["variants"].items(): + if "error" in info: + continue + path = os.path.join(RESULTS, info["file"]) + try: + sess = ort_session(path) + yq = sess.run(None, {sess.get_inputs()[0].name: fx})[0] + info["max_abs_diff_vs_fp32_fixture"] = float(np.abs(yq - fy).max()) + sessions[name] = sess + except Exception as e: # noqa: BLE001 + info["run_error"] = f"{type(e).__name__}: {e}" + print("\nfixture max-abs-diff vs fp32:", + {n: round(results["variants"][n].get("max_abs_diff_vs_fp32_fixture", + float("nan")), 5) + for n in results["variants"]}) + + # ---- latency: 3 interleaved reps incl. 
fp32 + dynamic-int8 reference ---- + print("\n=== latency (3 interleaved reps) ===") + lat_sessions = {"onnx_fp32": ort_session(FP32_ONNX), + "onnx_int8_ort_dynamic": ort_session(DYN_INT8_ONNX)} + lat_sessions.update(sessions) + results["latency"] = { + "note": "3 interleaved repetitions per variant, median ms/window; " + "onnx_fp32 / onnx_int8_ort_dynamic are same-session references", + **interleaved_latency(lat_sessions), + } + + # ---- accuracy on the standard 10k corruption-free test subset ---------- + if not args.skip_accuracy: + loader, n_clean = build_test_subset(args.data_dir, args.subset) + results["accuracy_subset"] = { + "description": "seed-42 file-level 70/15/15 test split, corrupted " + "windows excluded, seed-42 random subset (same as " + "quantize_bench/eval_ort_accuracy)", + "subset_size": min(args.subset, n_clean) if args.subset else n_clean, + } + for name, sess in sessions.items(): + print(f"\n=== accuracy: {name} ===") + results["variants"][name]["accuracy"] = evaluate_ort( + sess, loader, name) + print(json.dumps(results["variants"][name]["accuracy"], indent=2)) + + # ---- merge into edge_optimization.json ---------------------------------- + merged = {} + if os.path.exists(args.out): + with open(args.out) as f: + merged = json.load(f) + prev = merged.get("onnx_static_ptq") + if prev: # nested merge so partial --methods reruns don't clobber + prev["env"] = results["env"] + prev["variants"].update(results["variants"]) + prev.setdefault("latency", {}).update(results["latency"]) + if "accuracy_subset" in results: + prev["accuracy_subset"] = results["accuracy_subset"] + else: + merged["onnx_static_ptq"] = results + with open(args.out, "w") as f: + json.dump(merged, f, indent=2) + print(f"\nwrote {args.out}") + + +if __name__ == "__main__": + main() diff --git a/scripts/overnight-empty-capture.py b/scripts/overnight-empty-capture.py new file mode 100644 index 00000000..0eb8b999 --- /dev/null +++ b/scripts/overnight-empty-capture.py @@ -0,0 +1,80 
@@ +#!/usr/bin/env python3 +"""Segmented overnight empty-room CSI capture (ADR-135 baseline / MAE corpus). + +Binds UDP once and writes fixed-duration JSONL segments with explicit names — +no post-hoc renaming, no glob collisions with other recordings. + +Usage: + python scripts/overnight-empty-capture.py --segments 8 --segment-seconds 3300 +""" + +import argparse +import json +import os +import socket +import struct +import time + + +def parse_csi_packet(data): + """ADR-018 binary CSI packet → dict (same layout as record-csi-udp.py).""" + if len(data) < 8: + return None + node_id = data[4] + rssi = struct.unpack("b", bytes([data[6]]))[0] + channel = data[7] + iq = data[8:] + amplitudes = [] + for i in range(0, len(iq) - 1, 2): + I = struct.unpack("b", bytes([iq[i]]))[0] + Q = struct.unpack("b", bytes([iq[i + 1]]))[0] + amplitudes.append(round((I * I + Q * Q) ** 0.5, 2)) + return { + "type": "raw_csi", + "ts_ns": time.time_ns(), + "node_id": node_id, + "rssi": rssi, + "channel": channel, + "subcarriers": len(iq) // 2, + "amplitudes": amplitudes, + "iq_hex": iq.hex(), + } + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--port", type=int, default=5005) + ap.add_argument("--segments", type=int, default=8) + ap.add_argument("--segment-seconds", type=int, default=3300) + ap.add_argument("--output", default="data/recordings") + ap.add_argument("--prefix", default="overnight-empty") + args = ap.parse_args() + + os.makedirs(args.output, exist_ok=True) + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + sock.bind(("0.0.0.0", args.port)) + sock.settimeout(2.0) + + for seg in range(1, args.segments + 1): + path = os.path.join( + args.output, f"{args.prefix}-seg{seg}-{int(time.time())}.csi.jsonl" + ) + n = 0 + t_end = time.time() + args.segment_seconds + with open(path, "w", encoding="utf-8") as f: + while time.time() < t_end: + try: + data, _ = sock.recvfrom(4096) + except socket.timeout: + continue + rec = parse_csi_packet(data) + if rec is not 
None: + f.write(json.dumps(rec) + "\n") + n += 1 + print(f"segment {seg}: {n} frames -> {path}", flush=True) + + print("capture complete", flush=True) + + +if __name__ == "__main__": + main()