From 3eea302c5499fab86aa387e9dd1a33eac4ac0231 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 20 Jan 2026 23:26:07 +0000 Subject: [PATCH 1/4] Add uv requirement for Python development to example worldview Demonstrates the agent's ability to encode development practices: - Creates Python-development concept with .execution facet - Uses ! modifier for emphasis and | for condition --- example.wvf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/example.wvf b/example.wvf index c5ec062..b36f997 100644 --- a/example.wvf +++ b/example.wvf @@ -48,3 +48,7 @@ Institutions - ossify | over time - self-perpetuate // original purpose - capture-by-interests^ @public-choice-theory + +Python-development + .execution + - use uv ! | system python unavailable From 2897cd58478dce355cb53332c2e2213ea8c7c692 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 20 Jan 2026 23:32:04 +0000 Subject: [PATCH 2/4] Add personal experience entry to example worldview Demonstrates encoding a concrete life event: - Creates Personal-experiences concept with .social facet - Uses | condition marker for temporal context (sunday) --- example.wvf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/example.wvf b/example.wvf index b36f997..16311f6 100644 --- a/example.wvf +++ b/example.wvf @@ -52,3 +52,7 @@ Institutions Python-development .execution - use uv ! | system python unavailable + +Personal-experiences + .social + - park visit with friend | sunday From 6df99895081d1b42bd16f286030c086c8ec8a61f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 20 Jan 2026 23:34:19 +0000 Subject: [PATCH 3/4] Reject ephemeral events in worldview agent Add guidance to the agent prompt to reject transient personal events (like "I went to the park on Sunday") that don't represent durable beliefs or worldviews. The agent now politely declines and suggests reframing as a belief instead. Also reverts the test personal-experience entry from example.wvf. --- agent/src/main.rs | 10 ++++++++++ example.wvf | 4 ---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/agent/src/main.rs b/agent/src/main.rs index f48a25a..89ad04c 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -35,6 +35,16 @@ The Worldview file stores information the user explicitly provides for later ref The purpose is to capture the user's specific framing and claims, not to build a comprehensive knowledge base. General facts already exist in model weights and don't need to be stored. +## Critical: Reject Ephemeral Events + +The Worldview format is designed to store **durable beliefs, values, perspectives, and knowledge** — not transient events or one-time occurrences. You must: + +- **Reject ephemeral personal events** like "I went to the park on Sunday" or "I had coffee this morning" +- **Reject time-bound occurrences** that describe what happened rather than what the user believes or values +- **Accept beliefs about events** like "parks are good for mental health" or "Sunday routines matter" + +If given an ephemeral event, respond politely explaining that the Worldview format is for beliefs and perspectives, not personal diary entries, and do NOT modify the file. + Remember the design principles: state over narrative, predictability allows omission, conflict tolerance, freeform vocabulary, and LLM-native density. "#; diff --git a/example.wvf b/example.wvf index 16311f6..b36f997 100644 --- a/example.wvf +++ b/example.wvf @@ -52,7 +52,3 @@ Institutions Python-development .execution - use uv ! | system python unavailable - -Personal-experiences - .social - - park visit with friend | sunday From e70916fb3333b31c2ea84b0bb84e2e2dc52e6002 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 20 Jan 2026 23:40:21 +0000 Subject: [PATCH 4/4] Add ephemeral event rejection tests and silent agent mode Agent changes: - Suppress all output unless --verbose flag is used - Exit 0 for success (including correct rejections) - Exit 1 only for actual errors Eval changes: - Add REJECT task type for testing rejection behavior - Add should_modify_file field to WriteTestCase - Add evaluator handling for rejection cases - Add three new test cases: - accept-dev-preference: "always use uv to run python..." - reject-ephemeral-event: "on sunday I went to the park..." - filter-mixed-ephemeral: statement with both ephemeral framing and durable belief content --- agent/src/main.rs | 8 +++-- evals/write_eval/__init__.py | 2 ++ evals/write_eval/evaluator.py | 34 ++++++++++++++++++ evals/write_eval/test_cases.py | 66 +++++++++++++++++++++++++++++++++- 4 files changed, 106 insertions(+), 4 deletions(-) diff --git a/agent/src/main.rs b/agent/src/main.rs index 89ad04c..91fa969 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -338,7 +338,9 @@ async fn main() -> Result<()> { while let Some(step) = agent.next().await { match step { AgentStep::TextDelta(text) => { - print!("{}", text); + if cli.verbose { + print!("{}", text); + } } AgentStep::ThinkingDelta(thinking) => { if cli.verbose { @@ -403,12 +405,12 @@ async fn main() -> Result<()> { if cli.verbose { eprintln!("[error:{}ms] {}", total_elapsed.as_millis(), e); } - eprintln!("\nError: {}", e); + eprintln!("Error: {}", e); std::process::exit(1); } } } - println!("\n\nWorldview file updated: {:?}", file_path); + // Exit 0 for success (including correct rejections) Ok(()) } diff --git a/evals/write_eval/__init__.py b/evals/write_eval/__init__.py index d8996c7..842022a 100644 --- a/evals/write_eval/__init__.py +++ b/evals/write_eval/__init__.py @@ -27,6 +27,7 @@ ExpectedStructure, WriteTestCase, ALL_WRITE_CASES, + REJECTION_CASES, get_cases_by_complexity, get_cases_by_task_type, get_case_by_id, @@ -53,6 +54,7 @@ "ExpectedStructure", "WriteTestCase", "ALL_WRITE_CASES", + "REJECTION_CASES", "get_cases_by_complexity", "get_cases_by_task_type", "get_case_by_id", diff --git a/evals/write_eval/evaluator.py b/evals/write_eval/evaluator.py index f0372ee..bc38c65 100644 --- a/evals/write_eval/evaluator.py +++ b/evals/write_eval/evaluator.py @@ -390,6 +390,40 @@ def evaluate_write( expected = test_case.expected score = WriteScore(min_claims_required=expected.min_claims) + # Special handling for REJECT cases (should_modify_file=False) + if not test_case.should_modify_file: + # For rejection cases, the file should NOT be modified + # Compare to base_content - they should be identical + base_normalized = test_case.base_content.strip() + generated_normalized = generated_content.strip() + + if base_normalized == generated_normalized: + # Success: file was not modified + score.syntax_valid = True + score.syntax_score = 1.0 + score.concept_score = 1.0 + score.facet_score = 1.0 + score.operator_score = 1.0 + score.term_score = 1.0 + score.claim_count_score = 1.0 + score.overall_score = 1.0 + score.notes = "Correctly rejected: file unchanged" + return score + else: + # Failure: file was modified when it shouldn't have been + score.syntax_valid = True # Syntax might be valid but behavior is wrong + score.syntax_score = 1.0 + score.overall_score = 0.0 + score.notes = "Failed to reject: file was modified when it should have been left unchanged" + + # Still check for forbidden terms to provide useful feedback + for term in expected.forbidden_terms: + if find_term(term, generated_content): + score.forbidden_terms_found.append(term) + if score.forbidden_terms_found: + score.notes += f"; Forbidden terms found: {score.forbidden_terms_found}" + return score + # Syntax validation if validator_path: syntax_valid, errors, warnings = validate_syntax_with_binary( diff --git a/evals/write_eval/test_cases.py b/evals/write_eval/test_cases.py index 6e3a6a2..771668a 100644 --- a/evals/write_eval/test_cases.py +++ b/evals/write_eval/test_cases.py @@ -36,6 +36,7 @@ class TaskType(Enum): CREATE = "create" # Add to empty file APPEND = "append" # Add new content to existing file UPDATE = "update" # Modify existing content + REJECT = "reject" # Should refuse to modify file (ephemeral events, etc.) @dataclass @@ -88,6 +89,7 @@ class WriteTestCase: expected: ExpectedStructure base_content: str = "" notes: Optional[str] = None + should_modify_file: bool = True # False for REJECT cases # ============================================================================= @@ -380,11 +382,73 @@ class WriteTestCase: ), ] +# ============================================================================= +# REJECTION/FILTER TEST CASES +# Tests for ephemeral events that should be rejected or filtered +# ============================================================================= + +REJECTION_CASES = [ + WriteTestCase( + id="accept-dev-preference", + name="Development tooling preference", + complexity=Complexity.SIMPLE, + task_type=TaskType.CREATE, + fact_statement=( + "always use uv to run python projects a system python install is not available" + ), + expected=ExpectedStructure( + required_concepts=["Python"], + required_facets=[".execution", ".tooling", ".development"], + required_terms=["uv"], + min_claims=1, + ), + notes="Valid belief about development practices - should be accepted", + ), + WriteTestCase( + id="reject-ephemeral-event", + name="Ephemeral personal event", + complexity=Complexity.SIMPLE, + task_type=TaskType.REJECT, + fact_statement="on sunday I went to the park and met a friend", + expected=ExpectedStructure( + required_concepts=[], + required_facets=[], + required_terms=[], + forbidden_terms=["sunday", "park", "friend", "Personal"], + min_claims=0, + ), + should_modify_file=False, + notes="Ephemeral event with no durable belief - should be rejected entirely", + ), + WriteTestCase( + id="filter-mixed-ephemeral", + name="Mixed ephemeral and factual statement", + complexity=Complexity.MODERATE, + task_type=TaskType.CREATE, + fact_statement=( + "Last Tuesday my doctor told me that lack of sleep causes cognitive " + "decline and poor decision-making over time" + ), + expected=ExpectedStructure( + required_concepts=["Sleep"], + required_facets=[".deprivation", ".effects", ".cognition"], + required_operators=["=>"], + required_terms=["cognitive", "decision"], + forbidden_terms=["Tuesday", "doctor", "told me", "Last"], + min_claims=1, + ), + notes=( + "Contains ephemeral framing ('Last Tuesday my doctor told me') but also " + "a durable belief. Agent should extract only the belief about sleep." + ), + ), +] + # ============================================================================= # ALL TEST CASES # ============================================================================= -ALL_WRITE_CASES = SIMPLE_CASES + MODERATE_CASES + COMPLEX_CASES +ALL_WRITE_CASES = SIMPLE_CASES + MODERATE_CASES + COMPLEX_CASES + REJECTION_CASES def get_cases_by_complexity(complexity: Complexity) -> list[WriteTestCase]: