Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ scripts/run_lancedb_vs_openclaw_mem_assisted.sh \

Deterministic long-run profile (stable run-group path for reproducible reruns):

For the counterfactual ON/OFF protocol (Pillar A execute now; Pillar B pre-registered), see `docs/FULL_BENCHMARK_PLAN.md#counterfactual-onoff-plan-for-the-two-pillars`.

```bash
scripts/run_phase_ab_longmemeval50.sh
# writes to artifacts/phase-ab-compare/phase-ab-longmemeval50-seed7-topk10/
Expand Down
38 changes: 38 additions & 0 deletions docs/FULL_BENCHMARK_PLAN.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,44 @@ Required reporting:
- If available from provider/tool payloads, include in compare artifact.
- Current v0.2 retrieval reports do not provide tokenized cost telemetry; compare artifact records this as unavailable.

## Counterfactual ON/OFF plan for the two pillars

This section adds a falsifiable ON/OFF design while keeping current Phase A/B priorities intact.

### Scope and sequencing
- **Pillar A (execute now):** context pack contract hardening effects.
- **Pillar B (pre-register now, execute later):** learning-record/self-improving loop effects.
- Do not mix A and B rollout in the same implementation window.

### Experimental arms
- `A0/B0`: baseline behavior (current pack contract, no learning block).
- `A1/B0`: Pillar A ON (contract hardening enabled).
- `A0/B1`: Pillar B ON (reserved; spec only until Pillar A gate passes).
- `A1/B1`: both ON (reserved for later confirmation run).

For the current cycle, run only `A0/B0` vs `A1/B0`.

### Metric definitions (anti-gaming, explicit)
- **Recall@K / Precision@K / nDCG@K**:
- `K` must be fixed per run and written in the manifest (`top_k`, default `10`).
- The relevance source must be the dataset's `relevant_session_ids` field (no post-hoc re-labeling).
- **Citation coverage**:
- `1 - (included_without_citation_count / included_count)`.
- Also report the numerator and denominator explicitly.
- **Rationale coverage**:
- `1 - (included_without_reason_count / included_count)`.
- **Determinism pass rate**:
- For each fixed DB/query case, run 5 repeats and compare canonicalized JSON (excluding timestamp fields only); a case passes only if all repeats produce identical output.
- The pass-rate denominator must be the number of distinct cases, not the total number of runs.
- **Budget exclusion rate**:
- `excluded_by_budget / total_candidates`.
- **Latency p50/p95**:
- measured on the retrieval+pack path under the same runner/hardware profile recorded in manifest.

### Decision posture
- Treat this as a non-regression gate first (quality and determinism).
- Only after Pillar A gate passes should Pillar B execution be scheduled.

## Reproducibility contract

Each run package must include a manifest with:
Expand Down
12 changes: 9 additions & 3 deletions scripts/hybrid_two_stage_from_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@


def main() -> int:
ap = argparse.ArgumentParser(description="Build a two-stage hybrid retrieval report from two reports.")
ap = argparse.ArgumentParser(
description="Build a two-stage hybrid retrieval report from two reports."
)
ap.add_argument("--must-report", required=True)
ap.add_argument("--fallback-report", required=True)
ap.add_argument("--run-id", required=True)
Expand All @@ -33,7 +35,9 @@ def main() -> int:
must_report = load_report(must_path)
fallback_report = load_report(fallback_path)

stage2_max_ms = float(args.stage2_max_ms) if args.stage2_max_ms and args.stage2_max_ms > 0 else None
stage2_max_ms = (
float(args.stage2_max_ms) if args.stage2_max_ms and args.stage2_max_ms > 0 else None
)

manifest = {
"experiment": {
Expand Down Expand Up @@ -62,7 +66,9 @@ def main() -> int:
)

report_path = out_dir / "retrieval-report.json"
report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
report_path.write_text(
json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

md_lines = [
f"# Two-stage hybrid report ({args.run_id})",
Expand Down
48 changes: 37 additions & 11 deletions scripts/run_lancedb_vs_openclaw_mem_assisted.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def _session_importance_label(session: dict[str, Any]) -> str:
return "ignore"

# Fallback lexical proxy when dataset lacks labels.
merged = "\n".join(str(m.get("content") or "") for m in session.get("messages", []) if isinstance(m, dict)).lower()
merged = "\n".join(
str(m.get("content") or "") for m in session.get("messages", []) if isinstance(m, dict)
).lower()
must_kw = (
"must remember",
"important",
Expand Down Expand Up @@ -370,7 +372,9 @@ def _metric_pack(report: dict[str, Any]) -> dict[str, float]:
}


def _win_eval(*, baseline: dict[str, float], candidate: dict[str, float], policy: str) -> dict[str, Any]:
def _win_eval(
*, baseline: dict[str, float], candidate: dict[str, float], policy: str
) -> dict[str, Any]:
p95_gain = (
(baseline["search_ms_p95"] - candidate["search_ms_p95"]) / baseline["search_ms_p95"]
if baseline["search_ms_p95"]
Expand Down Expand Up @@ -493,7 +497,9 @@ def main() -> int:
args = ap.parse_args()

repo_root = Path(__file__).resolve().parents[1]
dataset_path = repo_root / args.dataset if not Path(args.dataset).is_absolute() else Path(args.dataset)
dataset_path = (
repo_root / args.dataset if not Path(args.dataset).is_absolute() else Path(args.dataset)
)
out_root = repo_root / args.output_root

run_group = _resolve_run_group(explicit_run_group=args.run_group, run_label=args.run_label)
Expand Down Expand Up @@ -524,7 +530,9 @@ def main() -> int:
if args.include_observational:
obs_dataset, obs_stats = _compress_dataset_observational(raw)
obs_path = run_dir / "derived-dataset-observational.json"
obs_path.write_text(json.dumps(obs_dataset, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
obs_path.write_text(
json.dumps(obs_dataset, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

obs_report = _run_lancedb(
dataset_path=obs_path,
Expand Down Expand Up @@ -561,7 +569,9 @@ def main() -> int:
for policy in args.policies:
filtered, filter_stats = _filter_dataset(raw, policy=policy)
filtered_path = run_dir / f"derived-dataset-{policy}.json"
filtered_path.write_text(json.dumps(filtered, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
filtered_path.write_text(
json.dumps(filtered, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

report = _run_lancedb(
dataset_path=filtered_path,
Expand Down Expand Up @@ -624,7 +634,11 @@ def main() -> int:
"fallback_report_path": fallback_candidate["report"]["report_path"],
"mode": "must_count_gate",
"fusion_mode": args.hybrid_fusion_mode,
"k_rrf": (float(args.hybrid_k_rrf) if args.hybrid_fusion_mode == "rrf_fusion" else None),
"k_rrf": (
float(args.hybrid_k_rrf)
if args.hybrid_fusion_mode == "rrf_fusion"
else None
),
"min_must_count": int(args.hybrid_min_must_count),
"stage2_max_additional": int(args.hybrid_stage2_max_additional),
"stage2_max_ms": stage2_max_ms,
Expand All @@ -640,7 +654,9 @@ def main() -> int:
hybrid_dir = run_dir / "hybrid"
hybrid_dir.mkdir(parents=True, exist_ok=True)
hybrid_json = hybrid_dir / "retrieval-report.json"
hybrid_json.write_text(json.dumps(hybrid_report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
hybrid_json.write_text(
json.dumps(hybrid_report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)
hybrid_md = hybrid_dir / "retrieval-report.md"
_write_hybrid_markdown(
path=hybrid_md,
Expand Down Expand Up @@ -702,10 +718,16 @@ def main() -> int:
# Pass if p95 improves >=20% while recall drop <=3pp and nDCG non-negative.
wins: list[dict[str, Any]] = []
for row in curve:
wins.append(_win_eval(baseline=row["baseline"], candidate=row["experimental"], policy=row["policy"]))
wins.append(
_win_eval(baseline=row["baseline"], candidate=row["experimental"], policy=row["policy"])
)

if hybrid_tradeoff is not None:
wins.append(_win_eval(baseline=baseline_metrics, candidate=hybrid_tradeoff["hybrid"], policy="hybrid"))
wins.append(
_win_eval(
baseline=baseline_metrics, candidate=hybrid_tradeoff["hybrid"], policy="hybrid"
)
)

arms: dict[str, Any] = {
"baseline": baseline,
Expand Down Expand Up @@ -750,7 +772,9 @@ def main() -> int:
}

compare_json = run_dir / f"compare-{run_group}.json"
compare_json.write_text(json.dumps(compare, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
compare_json.write_text(
json.dumps(compare, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

lines = [
f"# Phase A/B compare ({run_group})",
Expand Down Expand Up @@ -859,7 +883,9 @@ def main() -> int:
"compare_md": str(compare_md),
"latest_pointer": str(latest_pointer),
"baseline_report": baseline["report_path"],
"observational_report": (observational["report"]["report_path"] if observational is not None else None),
"observational_report": (
observational["report"]["report_path"] if observational is not None else None
),
"experimental_reports": [x["report"]["report_path"] for x in candidates],
"hybrid_report": (hybrid["report"]["report_path"] if hybrid is not None else None),
},
Expand Down
55 changes: 38 additions & 17 deletions scripts/run_longmemeval50_qa_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,7 @@ def openai_chat_completions(
with urllib.request.urlopen(req, timeout=timeout_s) as resp:
raw = resp.read().decode("utf-8")
data = json.loads(raw)
return (
data.get("choices", [{}])[0]
.get("message", {})
.get("content", "")
.strip()
)
return data.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
except urllib.error.HTTPError as e:
# Read body for debugging (keep local)
try:
Expand Down Expand Up @@ -275,13 +270,20 @@ def main() -> int:
ap.add_argument("--judge-model", default="")
ap.add_argument("--limit", type=int, default=20, help="question limit (default 20 for Phase A)")
ap.add_argument("--seed", type=int, default=7)
ap.add_argument("--arms", nargs="+", default=["oracle", "observational"], choices=["oracle", "full", "observational"])
ap.add_argument(
"--arms",
nargs="+",
default=["oracle", "observational"],
choices=["oracle", "full", "observational"],
)
ap.add_argument("--max-msg-chars", type=int, default=600)
args = ap.parse_args()

api_key = os.getenv("OPENAI_API_KEY") or ""
if not api_key.strip():
raise SystemExit("OPENAI_API_KEY is missing/empty. Set it in the environment before running.")
raise SystemExit(
"OPENAI_API_KEY is missing/empty. Set it in the environment before running."
)

judge_model = args.judge_model.strip() or args.model

Expand Down Expand Up @@ -311,7 +313,9 @@ def main() -> int:
"created_at": datetime.now(UTC).isoformat(),
"note": "Phase A QA compare on repo-local longmemeval-50 format (not official LongMemEval runner).",
}
(out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
(out_dir / "manifest.json").write_text(
json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8"
)

summary: dict[str, Any] = {"manifest": manifest, "arms": {}}

Expand All @@ -323,7 +327,10 @@ def main() -> int:

rows: list[Row] = []

with hyp_path.open("w", encoding="utf-8") as hyp_f, eval_path.open("w", encoding="utf-8") as eval_f:
with (
hyp_path.open("w", encoding="utf-8") as hyp_f,
eval_path.open("w", encoding="utf-8") as eval_f,
):
for i, q in enumerate(qs):
qid = str(q.get("question_id") or "")
qtype = str(q.get("question_type") or "")
Expand All @@ -333,7 +340,9 @@ def main() -> int:

rel_ids = set(str(x) for x in (q.get("relevant_session_ids") or []) if str(x))
if arm == "oracle":
arm_sessions = [s for s in sessions if str(s.get("session_id") or "") in rel_ids]
arm_sessions = [
s for s in sessions if str(s.get("session_id") or "") in rel_ids
]
else:
arm_sessions = sessions

Expand All @@ -352,11 +361,16 @@ def main() -> int:
max_tokens=256,
)

print(json.dumps({"question_id": qid, "hypothesis": hyp}, ensure_ascii=False), file=hyp_f)
print(
json.dumps({"question_id": qid, "hypothesis": hyp}, ensure_ascii=False),
file=hyp_f,
)

# Judge
abstention = "_abs" in qid
judge_prompt = get_anscheck_prompt(qtype, question, answer, hyp, abstention=abstention)
judge_prompt = get_anscheck_prompt(
qtype, question, answer, hyp, abstention=abstention
)

_jitter_sleep()
judge_resp = openai_chat_completions(
Expand Down Expand Up @@ -384,7 +398,7 @@ def main() -> int:
rows.append(Row(qid, qtype, question, answer, hyp, bool(label)))

# Progress line
print(f"[{arm}] {i+1}/{len(qs)} qid={qid} label={label}")
print(f"[{arm}] {i + 1}/{len(qs)} qid={qid} label={label}")

# Summarize
by_type: dict[str, list[bool]] = {}
Expand All @@ -398,13 +412,20 @@ def acc(xs: Iterable[bool]) -> float:
arm_summary = {
"n": len(rows),
"accuracy": acc([r.label for r in rows]),
"by_question_type": {k: {"accuracy": acc(v), "n": len(v)} for k, v in sorted(by_type.items())},
"paths": {"hypotheses": str(hyp_path.relative_to(REPO_ROOT)), "eval": str(eval_path.relative_to(REPO_ROOT))},
"by_question_type": {
k: {"accuracy": acc(v), "n": len(v)} for k, v in sorted(by_type.items())
},
"paths": {
"hypotheses": str(hyp_path.relative_to(REPO_ROOT)),
"eval": str(eval_path.relative_to(REPO_ROOT)),
},
}
summary["arms"][arm] = arm_summary

# Write summary
(out_dir / "summary.json").write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
(out_dir / "summary.json").write_text(
json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8"
)

# Markdown
md_lines: list[str] = []
Expand Down
Loading