Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions agent/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ The Worldview file stores information the user explicitly provides for later ref

The purpose is to capture the user's specific framing and claims, not to build a comprehensive knowledge base. General facts already exist in model weights and don't need to be stored.

## Critical: Reject Ephemeral Events

The Worldview format is designed to store **durable beliefs, values, perspectives, and knowledge** — not transient events or one-time occurrences. You must:

- **Reject ephemeral personal events** like "I went to the park on Sunday" or "I had coffee this morning"
- **Reject time-bound occurrences** that describe what happened rather than what the user believes or values
- **Accept beliefs about events** like "parks are good for mental health" or "Sunday routines matter"

If given an ephemeral event, respond politely explaining that the Worldview format is for beliefs and perspectives, not personal diary entries, and do NOT modify the file.

Remember the design principles: state over narrative, predictability allows omission, conflict tolerance, freeform vocabulary, and LLM-native density.
"#;

Expand Down Expand Up @@ -328,7 +338,9 @@ async fn main() -> Result<()> {
while let Some(step) = agent.next().await {
match step {
AgentStep::TextDelta(text) => {
print!("{}", text);
if cli.verbose {
print!("{}", text);
}
}
AgentStep::ThinkingDelta(thinking) => {
if cli.verbose {
Expand Down Expand Up @@ -393,12 +405,12 @@ async fn main() -> Result<()> {
if cli.verbose {
eprintln!("[error:{}ms] {}", total_elapsed.as_millis(), e);
}
eprintln!("\nError: {}", e);
eprintln!("Error: {}", e);
std::process::exit(1);
}
}
}

println!("\n\nWorldview file updated: {:?}", file_path);
// Exit 0 for success (including correct rejections)
Ok(())
}
2 changes: 2 additions & 0 deletions evals/write_eval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
ExpectedStructure,
WriteTestCase,
ALL_WRITE_CASES,
REJECTION_CASES,
get_cases_by_complexity,
get_cases_by_task_type,
get_case_by_id,
Expand All @@ -53,6 +54,7 @@
"ExpectedStructure",
"WriteTestCase",
"ALL_WRITE_CASES",
"REJECTION_CASES",
"get_cases_by_complexity",
"get_cases_by_task_type",
"get_case_by_id",
Expand Down
34 changes: 34 additions & 0 deletions evals/write_eval/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,40 @@ def evaluate_write(
expected = test_case.expected
score = WriteScore(min_claims_required=expected.min_claims)

# Special handling for REJECT cases (should_modify_file=False)
if not test_case.should_modify_file:
# For rejection cases, the file should NOT be modified
# Compare to base_content - they should be identical
base_normalized = test_case.base_content.strip()
generated_normalized = generated_content.strip()

if base_normalized == generated_normalized:
# Success: file was not modified
score.syntax_valid = True
score.syntax_score = 1.0
score.concept_score = 1.0
score.facet_score = 1.0
score.operator_score = 1.0
score.term_score = 1.0
score.claim_count_score = 1.0
score.overall_score = 1.0
score.notes = "Correctly rejected: file unchanged"
return score
else:
# Failure: file was modified when it shouldn't have been
score.syntax_valid = True # Syntax might be valid but behavior is wrong
score.syntax_score = 1.0
score.overall_score = 0.0
score.notes = "Failed to reject: file was modified when it should have been left unchanged"

# Still check for forbidden terms to provide useful feedback
for term in expected.forbidden_terms:
if find_term(term, generated_content):
score.forbidden_terms_found.append(term)
if score.forbidden_terms_found:
score.notes += f"; Forbidden terms found: {score.forbidden_terms_found}"
return score

# Syntax validation
if validator_path:
syntax_valid, errors, warnings = validate_syntax_with_binary(
Expand Down
66 changes: 65 additions & 1 deletion evals/write_eval/test_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class TaskType(Enum):
CREATE = "create" # Add to empty file
APPEND = "append" # Add new content to existing file
UPDATE = "update" # Modify existing content
REJECT = "reject" # Should refuse to modify file (ephemeral events, etc.)


@dataclass
Expand Down Expand Up @@ -88,6 +89,7 @@ class WriteTestCase:
expected: ExpectedStructure
base_content: str = ""
notes: Optional[str] = None
should_modify_file: bool = True # False for REJECT cases


# =============================================================================
Expand Down Expand Up @@ -380,11 +382,73 @@ class WriteTestCase:
),
]

# =============================================================================
# REJECTION/FILTER TEST CASES
# Tests for ephemeral events that should be rejected or filtered.
#
# Despite the list's name, only one case is an outright rejection. The three
# cases are:
#   1. accept-dev-preference  - a statement whose notes say it should be accepted
#   2. reject-ephemeral-event - the file must be left unmodified
#   3. filter-mixed-ephemeral - keep the durable belief, drop the ephemeral framing
# =============================================================================

REJECTION_CASES = [
# Case 1: should_modify_file is not set here, so the WriteTestCase default
# applies and the case is scored like any other CREATE case.
WriteTestCase(
id="accept-dev-preference",
name="Development tooling preference",
complexity=Complexity.SIMPLE,
task_type=TaskType.CREATE,
fact_statement=(
"always use uv to run python projects a system python install is not available"
),
expected=ExpectedStructure(
required_concepts=["Python"],
required_facets=[".execution", ".tooling", ".development"],
required_terms=["uv"],
min_claims=1,
),
notes="Valid belief about development practices - should be accepted",
),
# Case 2: the only case that sets should_modify_file=False. No base_content is
# given, so the unchanged-file expectation is relative to the WriteTestCase
# default base content.
WriteTestCase(
id="reject-ephemeral-event",
name="Ephemeral personal event",
complexity=Complexity.SIMPLE,
task_type=TaskType.REJECT,
fact_statement="on sunday I went to the park and met a friend",
expected=ExpectedStructure(
required_concepts=[],
required_facets=[],
required_terms=[],
# Words taken from the statement (plus "Personal") that must not show up
# in the output file.
forbidden_terms=["sunday", "park", "friend", "Personal"],
min_claims=0,
),
should_modify_file=False,
notes="Ephemeral event with no durable belief - should be rejected entirely",
),
# Case 3: a durable claim wrapped in ephemeral framing. The file is expected
# to be written (task_type is CREATE), but without the framing words.
WriteTestCase(
id="filter-mixed-ephemeral",
name="Mixed ephemeral and factual statement",
complexity=Complexity.MODERATE,
task_type=TaskType.CREATE,
fact_statement=(
"Last Tuesday my doctor told me that lack of sleep causes cognitive "
"decline and poor decision-making over time"
),
expected=ExpectedStructure(
required_concepts=["Sleep"],
required_facets=[".deprivation", ".effects", ".cognition"],
required_operators=["=>"],
required_terms=["cognitive", "decision"],
# The framing words from "Last Tuesday my doctor told me".
# NOTE(review): "Last" is a short, common token - confirm how find_term
# matches (case, word boundaries) so it cannot trip on unrelated text.
forbidden_terms=["Tuesday", "doctor", "told me", "Last"],
min_claims=1,
),
notes=(
"Contains ephemeral framing ('Last Tuesday my doctor told me') but also "
"a durable belief. Agent should extract only the belief about sleep."
),
),
]

# =============================================================================
# ALL TEST CASES
# =============================================================================

ALL_WRITE_CASES = SIMPLE_CASES + MODERATE_CASES + COMPLEX_CASES
ALL_WRITE_CASES = SIMPLE_CASES + MODERATE_CASES + COMPLEX_CASES + REJECTION_CASES


def get_cases_by_complexity(complexity: Complexity) -> list[WriteTestCase]:
Expand Down
4 changes: 4 additions & 0 deletions example.wvf
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,7 @@ Institutions
- ossify | over time
- self-perpetuate // original purpose
- capture-by-interests^ @public-choice-theory

Python-development
.execution
- use uv ! | system python unavailable