diff --git a/agent/src/main.rs b/agent/src/main.rs index f48a25a..91fa969 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -35,6 +35,16 @@ The Worldview file stores information the user explicitly provides for later ref The purpose is to capture the user's specific framing and claims, not to build a comprehensive knowledge base. General facts already exist in model weights and don't need to be stored. +## Critical: Reject Ephemeral Events + +The Worldview format is designed to store **durable beliefs, values, perspectives, and knowledge** — not transient events or one-time occurrences. You must: + +- **Reject ephemeral personal events** like "I went to the park on Sunday" or "I had coffee this morning" +- **Reject time-bound occurrences** that describe what happened rather than what the user believes or values +- **Accept beliefs about events** like "parks are good for mental health" or "Sunday routines matter" + +If given an ephemeral event, respond politely explaining that the Worldview format is for beliefs and perspectives, not personal diary entries, and do NOT modify the file. + Remember the design principles: state over narrative, predictability allows omission, conflict tolerance, freeform vocabulary, and LLM-native density. 
"#; @@ -328,7 +338,9 @@ async fn main() -> Result<()> { while let Some(step) = agent.next().await { match step { AgentStep::TextDelta(text) => { - print!("{}", text); + if cli.verbose { + print!("{}", text); + } } AgentStep::ThinkingDelta(thinking) => { if cli.verbose { @@ -393,12 +405,12 @@ async fn main() -> Result<()> { if cli.verbose { eprintln!("[error:{}ms] {}", total_elapsed.as_millis(), e); } - eprintln!("\nError: {}", e); + eprintln!("Error: {}", e); std::process::exit(1); } } } - println!("\n\nWorldview file updated: {:?}", file_path); + // Exit 0 for success (including correct rejections) Ok(()) } diff --git a/evals/write_eval/__init__.py b/evals/write_eval/__init__.py index d8996c7..842022a 100644 --- a/evals/write_eval/__init__.py +++ b/evals/write_eval/__init__.py @@ -27,6 +27,7 @@ ExpectedStructure, WriteTestCase, ALL_WRITE_CASES, + REJECTION_CASES, get_cases_by_complexity, get_cases_by_task_type, get_case_by_id, @@ -53,6 +54,7 @@ "ExpectedStructure", "WriteTestCase", "ALL_WRITE_CASES", + "REJECTION_CASES", "get_cases_by_complexity", "get_cases_by_task_type", "get_case_by_id", diff --git a/evals/write_eval/evaluator.py b/evals/write_eval/evaluator.py index f0372ee..bc38c65 100644 --- a/evals/write_eval/evaluator.py +++ b/evals/write_eval/evaluator.py @@ -390,6 +390,40 @@ def evaluate_write( expected = test_case.expected score = WriteScore(min_claims_required=expected.min_claims) + # Special handling for REJECT cases (should_modify_file=False) + if not test_case.should_modify_file: + # For rejection cases, the file should NOT be modified + # Compare to base_content - they should be identical + base_normalized = test_case.base_content.strip() + generated_normalized = generated_content.strip() + + if base_normalized == generated_normalized: + # Success: file was not modified + score.syntax_valid = True + score.syntax_score = 1.0 + score.concept_score = 1.0 + score.facet_score = 1.0 + score.operator_score = 1.0 + score.term_score = 1.0 + 
score.claim_count_score = 1.0 + score.overall_score = 1.0 + score.notes = "Correctly rejected: file unchanged" + return score + else: + # Failure: file was modified when it shouldn't have been + score.syntax_valid = True # Syntax might be valid but behavior is wrong + score.syntax_score = 1.0 + score.overall_score = 0.0 + score.notes = "Failed to reject: file was modified when it should have been left unchanged" + + # Still check for forbidden terms to provide useful feedback + for term in expected.forbidden_terms: + if find_term(term, generated_content): + score.forbidden_terms_found.append(term) + if score.forbidden_terms_found: + score.notes += f"; Forbidden terms found: {score.forbidden_terms_found}" + return score + # Syntax validation if validator_path: syntax_valid, errors, warnings = validate_syntax_with_binary( diff --git a/evals/write_eval/test_cases.py b/evals/write_eval/test_cases.py index 6e3a6a2..771668a 100644 --- a/evals/write_eval/test_cases.py +++ b/evals/write_eval/test_cases.py @@ -36,6 +36,7 @@ class TaskType(Enum): CREATE = "create" # Add to empty file APPEND = "append" # Add new content to existing file UPDATE = "update" # Modify existing content + REJECT = "reject" # Should refuse to modify file (ephemeral events, etc.) 
@dataclass @@ -88,6 +89,7 @@ class WriteTestCase: expected: ExpectedStructure base_content: str = "" notes: Optional[str] = None + should_modify_file: bool = True # False for REJECT cases # ============================================================================= @@ -380,11 +382,73 @@ class WriteTestCase: ), ] +# ============================================================================= +# REJECTION/FILTER TEST CASES +# Tests for ephemeral events that should be rejected or filtered, plus an accepted durable-preference control +# ============================================================================= + +REJECTION_CASES = [ + WriteTestCase( + id="accept-dev-preference", + name="Development tooling preference", + complexity=Complexity.SIMPLE, + task_type=TaskType.CREATE, + fact_statement=( + "always use uv to run python projects a system python install is not available" + ), + expected=ExpectedStructure( + required_concepts=["Python"], + required_facets=[".execution", ".tooling", ".development"], + required_terms=["uv"], + min_claims=1, + ), + notes="Valid belief about development practices - should be accepted", + ), + WriteTestCase( + id="reject-ephemeral-event", + name="Ephemeral personal event", + complexity=Complexity.SIMPLE, + task_type=TaskType.REJECT, + fact_statement="on sunday I went to the park and met a friend", + expected=ExpectedStructure( + required_concepts=[], + required_facets=[], + required_terms=[], + forbidden_terms=["sunday", "park", "friend", "Personal"], + min_claims=0, + ), + should_modify_file=False, + notes="Ephemeral event with no durable belief - should be rejected entirely", + ), + WriteTestCase( + id="filter-mixed-ephemeral", + name="Mixed ephemeral and factual statement", + complexity=Complexity.MODERATE, + task_type=TaskType.CREATE, + fact_statement=( + "Last Tuesday my doctor told me that lack of sleep causes cognitive " + "decline and poor decision-making over time" + ), + expected=ExpectedStructure( + required_concepts=["Sleep"], + required_facets=[".deprivation", 
".effects", ".cognition"], + required_operators=["=>"], + required_terms=["cognitive", "decision"], + forbidden_terms=["Tuesday", "doctor", "told me", "Last"], + min_claims=1, + ), + notes=( + "Contains ephemeral framing ('Last Tuesday my doctor told me') but also " + "a durable belief. Agent should extract only the belief about sleep." + ), + ), +] + # ============================================================================= # ALL TEST CASES # ============================================================================= -ALL_WRITE_CASES = SIMPLE_CASES + MODERATE_CASES + COMPLEX_CASES +ALL_WRITE_CASES = SIMPLE_CASES + MODERATE_CASES + COMPLEX_CASES + REJECTION_CASES def get_cases_by_complexity(complexity: Complexity) -> list[WriteTestCase]: diff --git a/example.wvf b/example.wvf index c5ec062..b36f997 100644 --- a/example.wvf +++ b/example.wvf @@ -48,3 +48,7 @@ Institutions - ossify | over time - self-perpetuate // original purpose - capture-by-interests^ @public-choice-theory + +Python-development + .execution + - use uv ! | system python unavailable