Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ scripts/run_lancedb_vs_openclaw_mem_assisted.sh \

Deterministic long-run profile (stable run-group path for reproducible reruns):

For the counterfactual ON/OFF protocol (Pillar A execute now; Pillar B pre-registered), see `docs/FULL_BENCHMARK_PLAN.md#counterfactual-onoff-plan-for-the-two-pillars`.

```bash
scripts/run_phase_ab_longmemeval50.sh
# writes to artifacts/phase-ab-compare/phase-ab-longmemeval50-seed7-topk10/
Expand Down
38 changes: 38 additions & 0 deletions docs/FULL_BENCHMARK_PLAN.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,44 @@ Required reporting:
- If available from provider/tool payloads, include in compare artifact.
- Current v0.2 retrieval reports do not provide tokenized cost telemetry; compare artifact records this as unavailable.

## Counterfactual ON/OFF plan for the two pillars

This section adds a falsifiable ON/OFF design while keeping current Phase A/B priorities intact.

### Scope and sequencing
- **Pillar A (execute now):** context pack contract hardening effects.
- **Pillar B (pre-register now, execute later):** learning-record/self-improving loop effects.
- Do not mix A and B rollout in the same implementation window.

### Experimental arms
- `A0/B0`: baseline behavior (current pack contract, no learning block).
- `A1/B0`: Pillar A ON (contract hardening enabled).
- `A0/B1`: Pillar B ON (reserved; spec only until Pillar A gate passes).
- `A1/B1`: both ON (reserved for later confirmation run).

For the current cycle, run only `A0/B0` vs `A1/B0`.

### Metric definitions (anti-gaming, explicit)
- **Recall@K / Precision@K / nDCG@K**:
- `K` must be fixed per run and written in the manifest (`top_k`, default `10`).
- The relevance source must be the dataset's `relevant_session_ids` field (no post-hoc re-labeling).
- **Citation coverage**:
- `1 - (included_without_citation_count / included_count)`.
- Also report the numerator and denominator explicitly.
- **Rationale coverage**:
- `1 - (included_without_reason_count / included_count)`.
- **Determinism pass rate**:
- For each fixed DB/query case, run 5 repeats and compare canonicalized JSON (excluding timestamp fields only); a case passes only if all repeats produce identical output.
- The pass-rate denominator must be the number of distinct cases, not the total number of runs.
- **Budget exclusion rate**:
- `excluded_by_budget / total_candidates`.
- **Latency p50/p95**:
- measured on the retrieval+pack path under the same runner/hardware profile recorded in manifest.

### Decision posture
- Treat this as a non-regression gate first (quality and determinism).
- Only after Pillar A gate passes should Pillar B execution be scheduled.

## Reproducibility contract

Each run package must include a manifest with:
Expand Down
12 changes: 9 additions & 3 deletions scripts/hybrid_two_stage_from_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@


def main() -> int:
ap = argparse.ArgumentParser(description="Build a two-stage hybrid retrieval report from two reports.")
ap = argparse.ArgumentParser(
description="Build a two-stage hybrid retrieval report from two reports."
)
ap.add_argument("--must-report", required=True)
ap.add_argument("--fallback-report", required=True)
ap.add_argument("--run-id", required=True)
Expand All @@ -33,7 +35,9 @@ def main() -> int:
must_report = load_report(must_path)
fallback_report = load_report(fallback_path)

stage2_max_ms = float(args.stage2_max_ms) if args.stage2_max_ms and args.stage2_max_ms > 0 else None
stage2_max_ms = (
float(args.stage2_max_ms) if args.stage2_max_ms and args.stage2_max_ms > 0 else None
)

manifest = {
"experiment": {
Expand Down Expand Up @@ -62,7 +66,9 @@ def main() -> int:
)

report_path = out_dir / "retrieval-report.json"
report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
report_path.write_text(
json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

md_lines = [
f"# Two-stage hybrid report ({args.run_id})",
Expand Down
48 changes: 37 additions & 11 deletions scripts/run_lancedb_vs_openclaw_mem_assisted.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def _session_importance_label(session: dict[str, Any]) -> str:
return "ignore"

# Fallback lexical proxy when dataset lacks labels.
merged = "\n".join(str(m.get("content") or "") for m in session.get("messages", []) if isinstance(m, dict)).lower()
merged = "\n".join(
str(m.get("content") or "") for m in session.get("messages", []) if isinstance(m, dict)
).lower()
must_kw = (
"must remember",
"important",
Expand Down Expand Up @@ -370,7 +372,9 @@ def _metric_pack(report: dict[str, Any]) -> dict[str, float]:
}


def _win_eval(*, baseline: dict[str, float], candidate: dict[str, float], policy: str) -> dict[str, Any]:
def _win_eval(
*, baseline: dict[str, float], candidate: dict[str, float], policy: str
) -> dict[str, Any]:
p95_gain = (
(baseline["search_ms_p95"] - candidate["search_ms_p95"]) / baseline["search_ms_p95"]
if baseline["search_ms_p95"]
Expand Down Expand Up @@ -493,7 +497,9 @@ def main() -> int:
args = ap.parse_args()

repo_root = Path(__file__).resolve().parents[1]
dataset_path = repo_root / args.dataset if not Path(args.dataset).is_absolute() else Path(args.dataset)
dataset_path = (
repo_root / args.dataset if not Path(args.dataset).is_absolute() else Path(args.dataset)
)
out_root = repo_root / args.output_root

run_group = _resolve_run_group(explicit_run_group=args.run_group, run_label=args.run_label)
Expand Down Expand Up @@ -524,7 +530,9 @@ def main() -> int:
if args.include_observational:
obs_dataset, obs_stats = _compress_dataset_observational(raw)
obs_path = run_dir / "derived-dataset-observational.json"
obs_path.write_text(json.dumps(obs_dataset, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
obs_path.write_text(
json.dumps(obs_dataset, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

obs_report = _run_lancedb(
dataset_path=obs_path,
Expand Down Expand Up @@ -561,7 +569,9 @@ def main() -> int:
for policy in args.policies:
filtered, filter_stats = _filter_dataset(raw, policy=policy)
filtered_path = run_dir / f"derived-dataset-{policy}.json"
filtered_path.write_text(json.dumps(filtered, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
filtered_path.write_text(
json.dumps(filtered, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

report = _run_lancedb(
dataset_path=filtered_path,
Expand Down Expand Up @@ -624,7 +634,11 @@ def main() -> int:
"fallback_report_path": fallback_candidate["report"]["report_path"],
"mode": "must_count_gate",
"fusion_mode": args.hybrid_fusion_mode,
"k_rrf": (float(args.hybrid_k_rrf) if args.hybrid_fusion_mode == "rrf_fusion" else None),
"k_rrf": (
float(args.hybrid_k_rrf)
if args.hybrid_fusion_mode == "rrf_fusion"
else None
),
"min_must_count": int(args.hybrid_min_must_count),
"stage2_max_additional": int(args.hybrid_stage2_max_additional),
"stage2_max_ms": stage2_max_ms,
Expand All @@ -640,7 +654,9 @@ def main() -> int:
hybrid_dir = run_dir / "hybrid"
hybrid_dir.mkdir(parents=True, exist_ok=True)
hybrid_json = hybrid_dir / "retrieval-report.json"
hybrid_json.write_text(json.dumps(hybrid_report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
hybrid_json.write_text(
json.dumps(hybrid_report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)
hybrid_md = hybrid_dir / "retrieval-report.md"
_write_hybrid_markdown(
path=hybrid_md,
Expand Down Expand Up @@ -702,10 +718,16 @@ def main() -> int:
# Pass if p95 improves >=20% while recall drop <=3pp and nDCG non-negative.
wins: list[dict[str, Any]] = []
for row in curve:
wins.append(_win_eval(baseline=row["baseline"], candidate=row["experimental"], policy=row["policy"]))
wins.append(
_win_eval(baseline=row["baseline"], candidate=row["experimental"], policy=row["policy"])
)

if hybrid_tradeoff is not None:
wins.append(_win_eval(baseline=baseline_metrics, candidate=hybrid_tradeoff["hybrid"], policy="hybrid"))
wins.append(
_win_eval(
baseline=baseline_metrics, candidate=hybrid_tradeoff["hybrid"], policy="hybrid"
)
)

arms: dict[str, Any] = {
"baseline": baseline,
Expand Down Expand Up @@ -750,7 +772,9 @@ def main() -> int:
}

compare_json = run_dir / f"compare-{run_group}.json"
compare_json.write_text(json.dumps(compare, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
compare_json.write_text(
json.dumps(compare, ensure_ascii=False, indent=2) + "\n", encoding="utf-8"
)

lines = [
f"# Phase A/B compare ({run_group})",
Expand Down Expand Up @@ -859,7 +883,9 @@ def main() -> int:
"compare_md": str(compare_md),
"latest_pointer": str(latest_pointer),
"baseline_report": baseline["report_path"],
"observational_report": (observational["report"]["report_path"] if observational is not None else None),
"observational_report": (
observational["report"]["report_path"] if observational is not None else None
),
"experimental_reports": [x["report"]["report_path"] for x in candidates],
"hybrid_report": (hybrid["report"]["report_path"] if hybrid is not None else None),
},
Expand Down
55 changes: 38 additions & 17 deletions scripts/run_longmemeval50_qa_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,7 @@ def openai_chat_completions(
with urllib.request.urlopen(req, timeout=timeout_s) as resp:
raw = resp.read().decode("utf-8")
data = json.loads(raw)
return (
data.get("choices", [{}])[0]
.get("message", {})
.get("content", "")
.strip()
)
return data.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
except urllib.error.HTTPError as e:
# Read body for debugging (keep local)
try:
Expand Down Expand Up @@ -275,13 +270,20 @@ def main() -> int:
ap.add_argument("--judge-model", default="")
ap.add_argument("--limit", type=int, default=20, help="question limit (default 20 for Phase A)")
ap.add_argument("--seed", type=int, default=7)
ap.add_argument("--arms", nargs="+", default=["oracle", "observational"], choices=["oracle", "full", "observational"])
ap.add_argument(
"--arms",
nargs="+",
default=["oracle", "observational"],
choices=["oracle", "full", "observational"],
)
ap.add_argument("--max-msg-chars", type=int, default=600)
args = ap.parse_args()

api_key = os.getenv("OPENAI_API_KEY") or ""
if not api_key.strip():
raise SystemExit("OPENAI_API_KEY is missing/empty. Set it in the environment before running.")
raise SystemExit(
"OPENAI_API_KEY is missing/empty. Set it in the environment before running."
)

judge_model = args.judge_model.strip() or args.model

Expand Down Expand Up @@ -311,7 +313,9 @@ def main() -> int:
"created_at": datetime.now(UTC).isoformat(),
"note": "Phase A QA compare on repo-local longmemeval-50 format (not official LongMemEval runner).",
}
(out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8")
(out_dir / "manifest.json").write_text(
json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8"
)

summary: dict[str, Any] = {"manifest": manifest, "arms": {}}

Expand All @@ -323,7 +327,10 @@ def main() -> int:

rows: list[Row] = []

with hyp_path.open("w", encoding="utf-8") as hyp_f, eval_path.open("w", encoding="utf-8") as eval_f:
with (
hyp_path.open("w", encoding="utf-8") as hyp_f,
eval_path.open("w", encoding="utf-8") as eval_f,
):
for i, q in enumerate(qs):
qid = str(q.get("question_id") or "")
qtype = str(q.get("question_type") or "")
Expand All @@ -333,7 +340,9 @@ def main() -> int:

rel_ids = set(str(x) for x in (q.get("relevant_session_ids") or []) if str(x))
if arm == "oracle":
arm_sessions = [s for s in sessions if str(s.get("session_id") or "") in rel_ids]
arm_sessions = [
s for s in sessions if str(s.get("session_id") or "") in rel_ids
]
else:
arm_sessions = sessions

Expand All @@ -352,11 +361,16 @@ def main() -> int:
max_tokens=256,
)

print(json.dumps({"question_id": qid, "hypothesis": hyp}, ensure_ascii=False), file=hyp_f)
print(
json.dumps({"question_id": qid, "hypothesis": hyp}, ensure_ascii=False),
file=hyp_f,
)

# Judge
abstention = "_abs" in qid
judge_prompt = get_anscheck_prompt(qtype, question, answer, hyp, abstention=abstention)
judge_prompt = get_anscheck_prompt(
qtype, question, answer, hyp, abstention=abstention
)

_jitter_sleep()
judge_resp = openai_chat_completions(
Expand Down Expand Up @@ -384,7 +398,7 @@ def main() -> int:
rows.append(Row(qid, qtype, question, answer, hyp, bool(label)))

# Progress line
print(f"[{arm}] {i+1}/{len(qs)} qid={qid} label={label}")
print(f"[{arm}] {i + 1}/{len(qs)} qid={qid} label={label}")

# Summarize
by_type: dict[str, list[bool]] = {}
Expand All @@ -398,13 +412,20 @@ def acc(xs: Iterable[bool]) -> float:
arm_summary = {
"n": len(rows),
"accuracy": acc([r.label for r in rows]),
"by_question_type": {k: {"accuracy": acc(v), "n": len(v)} for k, v in sorted(by_type.items())},
"paths": {"hypotheses": str(hyp_path.relative_to(REPO_ROOT)), "eval": str(eval_path.relative_to(REPO_ROOT))},
"by_question_type": {
k: {"accuracy": acc(v), "n": len(v)} for k, v in sorted(by_type.items())
},
"paths": {
"hypotheses": str(hyp_path.relative_to(REPO_ROOT)),
"eval": str(eval_path.relative_to(REPO_ROOT)),
},
}
summary["arms"][arm] = arm_summary

# Write summary
(out_dir / "summary.json").write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
(out_dir / "summary.json").write_text(
json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8"
)

# Markdown
md_lines: list[str] = []
Expand Down
Loading