diff --git a/src/cases/difficulty-ladder.ts b/src/cases/difficulty-ladder.ts new file mode 100644 index 0000000..df14bb2 --- /dev/null +++ b/src/cases/difficulty-ladder.ts @@ -0,0 +1,254 @@ +/** + * Difficulty Ladder System + * + * Generates difficulty variants (easy/medium/hard) from a base case. + * Each variant provides different levels of scaffolding/context. + */ + +import { Case, CaseFile, CaseDifficulty } from './types'; + +// ============================================================================= +// Difficulty Levels +// ============================================================================= + +/** + * Scaffolding level for a difficulty variant + */ +export interface ScaffoldingLevel { + /** How much context is provided */ + level: 'easy' | 'medium' | 'hard'; + + /** Additional files to include (beyond base case files) */ + additionalFiles?: CaseFile[]; + + /** Hints or guidance to add to the prompt */ + hints?: string[]; + + /** Whether to show the solution (for testing only) */ + showSolution?: boolean; + + /** Notes about this variant */ + notes?: string; +} + +/** + * Difficulty ladder configuration for a case + */ +export interface DifficultyLadder { + /** Base case (the original task) */ + base: Case; + + /** Scaffolding levels for each difficulty */ + levels: Record; + + /** Whether to auto-generate variants on load */ + autoGenerate?: boolean; +} + +// ============================================================================= +// Difficulty Ladder Generator +// ============================================================================= + +/** + * Generate difficulty variants from a base case + */ +export function generateDifficultyVariants(baseCase: Case): Case[] { + const variants: Case[] = []; + + // Create easy variant + const easyVariant = createVariant(baseCase, 'easy'); + if (easyVariant) variants.push(easyVariant); + + // Create medium variant + const mediumVariant = createVariant(baseCase, 'medium'); + if (mediumVariant) variants.push(mediumVariant); + + // Create hard variant + const hardVariant = createVariant(baseCase, 'hard'); + if (hardVariant) variants.push(hardVariant); + + return variants; +} + +/** + * Create a single difficulty variant + */ +function createVariant(baseCase: Case, difficulty: CaseDifficulty): Case | null { + const level = baseCase.difficultyLadder?.levels[difficulty]; + if (!level) { + // No ladder defined, return null (use base case as-is) + return null; + } + + // Combine base files with additional files + const allFiles = baseCase.files ? [...baseCase.files] : []; + if (level.additionalFiles) { + allFiles.push(...level.additionalFiles); + } + + // Build prompt with hints + let prompt = baseCase.prompt; + if (level.hints && level.hints.length > 0) { + prompt += '\n\n' + level.hints.join('\n'); + } + + // Create variant case + const variant: Case = { + ...baseCase, + id: `${baseCase.id}-${difficulty}`, + title: `${baseCase.title} (${difficulty})`, + prompt, + files: allFiles, + difficulty, + // Don't include solution in variants (unless explicitly requested) + solution: level.showSolution ? baseCase.solution : undefined, + notes: level.notes, + }; + + return variant; +} + +/** + * Get the scaffolding level for a difficulty + */ +export function getScaffoldingLevel(baseCase: Case, difficulty: CaseDifficulty): ScaffoldingLevel | null { + return baseCase.difficultyLadder?.levels[difficulty] || null; +} + +/** + * Check if a case has a difficulty ladder defined + */ +export function hasDifficultyLadder(baseCase: Case): boolean { + return !!baseCase.difficultyLadder; +} + +// ============================================================================= +// Difficulty Ladder Builder +// ============================================================================= + +/** + * Builder for creating difficulty ladders + */ +export class DifficultyLadderBuilder { + private base: Case; + private levels: Partial> = {}; + + constructor(baseCase: Case) { + this.base = baseCase; + } + + /** + * Set the easy level scaffolding + */ + withEasy(hints?: string[], additionalFiles?: CaseFile[], notes?: string): this { + this.levels.easy = { + level: 'easy', + hints, + additionalFiles, + notes, + }; + return this; + } + + /** + * Set the medium level scaffolding + */ + withMedium(hints?: string[], additionalFiles?: CaseFile[], notes?: string): this { + this.levels.medium = { + level: 'medium', + hints, + additionalFiles, + notes, + }; + return this; + } + + /** + * Set the hard level scaffolding + */ + withHard(hints?: string[], additionalFiles?: CaseFile[], notes?: string): this { + this.levels.hard = { + level: 'hard', + hints, + additionalFiles, + notes, + }; + return this; + } + + /** + * Build the difficulty ladder + */ + build(): DifficultyLadder { + return { + base: this.base, + levels: this.levels as Record, + autoGenerate: true, + }; + } +} + +// ============================================================================= +// Default Scaffolding Strategies +// ============================================================================= + +/** + * Default scaffolding for easy level + * - More context, hints, and guidance + */ +export function defaultEasyScaffolding(_baseCase: Case): ScaffoldingLevel { + return { + level: 'easy', + hints: [ + 'This is an easy task. Focus on correctness and following best practices.', + 'You have all the context you need to complete this task.', + 'Take your time to understand the codebase before making changes.', + ], + notes: 'Easy: Maximum scaffolding provided', + }; +} + +/** + * Default scaffolding for medium level + * - Standard context, minimal hints + */ +export function defaultMediumScaffolding(_baseCase: Case): ScaffoldingLevel { + return { + level: 'medium', + hints: [ + 'Complete this task to the best of your ability.', + ], + notes: 'Medium: Standard scaffolding', + }; +} + +/** + * Default scaffolding for hard level + * - Minimal context, agent must discover + */ +export function defaultHardScaffolding(_baseCase: Case): ScaffoldingLevel { + return { + level: 'hard', + hints: [ + 'You need to figure out the best approach for this task.', + 'Explore the codebase to understand the context.', + 'Make reasonable assumptions and document them.', + ], + notes: 'Hard: Minimal scaffolding', + }; +} + +/** + * Create a difficulty ladder with default scaffolding + */ +export function createDefaultLadder(_baseCase: Case): DifficultyLadder { + return { + base: _baseCase, + levels: { + easy: defaultEasyScaffolding(_baseCase), + medium: defaultMediumScaffolding(_baseCase), + hard: defaultHardScaffolding(_baseCase), + }, + autoGenerate: true, + }; +} diff --git a/src/cases/types.ts b/src/cases/types.ts index aaaf1fe..5aa4104 100644 --- a/src/cases/types.ts +++ b/src/cases/types.ts @@ -1,3 +1,4 @@ +import type { ScaffoldingLevel } from './difficulty-ladder'; /** * Case and Rubric Schema Types * @@ -39,6 +40,20 @@ export type CaseSource = 'bootstrap' | 'generated' | 'manual' | 'imported' | 'cl */ export type CaseDifficulty = 'easy' | 'medium' | 'hard'; + +/** + * Difficulty ladder configuration for a case + */ +export interface DifficultyLadder { + /** Base case (the original task) */ + base: Case; + + /** Scaffolding levels for each difficulty */ + levels: Record; + + /** Whether to auto-generate variants on load */ + autoGenerate?: boolean; +} /** * Agent behavior expectations for a case */ @@ -110,6 +125,9 @@ export interface Case { /** Additional notes or hints (not shown to agent) */ notes?: string; + // Difficulty ladder configuration for generating variants + difficultyLadder?: DifficultyLadder; + // Metadata added by loader /** Source file path (added by loader) */ _sourcePath?: string; diff --git a/src/cases/types.ts.bak b/src/cases/types.ts.bak new file mode 100644 index 0000000..aaaf1fe --- /dev/null +++ b/src/cases/types.ts.bak @@ -0,0 +1,541 @@ +/** + * Case and Rubric Schema Types + * + * Cases are structured as "interview questions" - a prompt given to an agent + * with optional starting files and metadata. + * + * Rubrics define how to grade the agent's response - weighted criteria + * with evaluators that produce scores. + */ + +// ============================================================================= +// Case Types (The Interview Question) +// ============================================================================= + +/** + * A file provided as part of a case (starting code, tests, etc.) + */ +export interface CaseFile { + /** Relative path within the workspace */ + path: string; + + /** File content (inline) */ + content?: string; + + /** Reference to external file (alternative to inline content) */ + ref?: string; + + /** Whether this file is read-only (agent shouldn't modify) */ + readonly?: boolean; +} + +/** + * Source/origin of a case + */ +export type CaseSource = 'bootstrap' | 'generated' | 'manual' | 'imported' | 'closed_issue'; + +/** + * Difficulty level + */ +export type CaseDifficulty = 'easy' | 'medium' | 'hard'; + +/** + * Agent behavior expectations for a case + */ +export interface CaseExpectations { + /** Maximum time in seconds */ + maxTimeSeconds?: number; + + /** Maximum tokens (input + output) */ + maxTokens?: number; + + /** Maximum iterations/turns */ + maxIterations?: number; + + /** Tools the agent is allowed to use */ + allowedTools?: string[]; + + /** Tools the agent should not use */ + disallowedTools?: string[]; +} + +/** + * A test case - the "interview question" given to an agent + */ +export interface Case { + /** Unique identifier (e.g., "bootstrap-001", "error-handling-py-001") */ + id: string; + + /** Human-readable title */ + title: string; + + /** The interview question - what we're asking the agent to do */ + prompt: string; + + /** Optional starting files (empty = greenfield task) */ + files?: CaseFile[]; + + /** + * Rubric to use for evaluation. + * Can be: + * - string: reference to a rubric ID (e.g., "default", "strict-security") + * - object: inline rubric or extension of existing rubric + */ + rubric?: string | RubricReference; + + /** Where this case came from */ + source: CaseSource; + + /** Primary programming language */ + language: string; + + /** Difficulty level */ + difficulty: CaseDifficulty; + + /** Category for organization (e.g., "error-handling", "security", "performance") */ + category: string; + + /** Tags for filtering */ + tags?: string[]; + + /** Expected agent behavior bounds */ + expectations?: CaseExpectations; + + /** Version of this case (for tracking changes) */ + version?: string; + + /** Reference solution (not shown to agent, used for validation) */ + solution?: CaseFile[]; + + /** Additional notes or hints (not shown to agent) */ + notes?: string; + + // Metadata added by loader + /** Source file path (added by loader) */ + _sourcePath?: string; + + /** When this case was loaded (added by loader) */ + _loadedAt?: Date; +} + +// ============================================================================= +// Rubric Types (How We Grade) +// ============================================================================= + +/** + * Types of evaluators available + */ +export type EvaluatorType = + | 'command' // Run a shell command, check exit code + | 'pattern' // Regex match on files + | 'benchmark' // Run command, extract numeric metric + | 'diff' // Compare output to expected + | 'llm_judge' // Use LLM to evaluate (subjective criteria) + | 'agent_behavior'; // Evaluate agent behavior metrics + +/** + * Base evaluator configuration + */ +export interface EvaluatorBase { + /** Type of evaluator */ + type: EvaluatorType; + + /** Human-readable name for this check */ + name?: string; + + /** Whether this evaluator is optional (won't fail if it errors) */ + optional?: boolean; + + /** Whether to award partial credit (vs pass/fail) */ + partialCredit?: boolean; + + /** Threshold for passing (0.0-1.0, default 1.0) */ + passThreshold?: number; +} + +/** + * Command evaluator - runs a shell command + */ +export interface CommandEvaluator extends EvaluatorBase { + type: 'command'; + + /** Command to run */ + run: string; + + /** How to parse output (for partial credit) */ + parse?: 'exit_code' | 'json' | 'junit' | 'tap'; + + /** JSONPath expression to extract score (when parse=json) */ + scorePath?: string; + + /** Fail if this pattern is found in output */ + failIfMatch?: string; + + /** Fail if this pattern is NOT found in output */ + failIfNoMatch?: string; +} + +/** + * Pattern evaluator - regex match on files + */ +export interface PatternEvaluator extends EvaluatorBase { + type: 'pattern'; + + /** Glob pattern for files to check */ + files: string; + + /** Fail if this pattern matches */ + failIfMatch?: string; + + /** Fail if this pattern does NOT match */ + requireMatch?: string; + + /** Case-insensitive matching */ + ignoreCase?: boolean; +} + +/** + * Benchmark evaluator - extract numeric metrics + */ +export interface BenchmarkEvaluator extends EvaluatorBase { + type: 'benchmark'; + + /** Command to run */ + run: string; + + /** Name of the metric being measured */ + metric: string; + + /** JSONPath to extract value (if output is JSON) */ + valuePath?: string; + + /** Regex to extract value from output */ + valuePattern?: string; + + /** Minimum acceptable value */ + minValue?: number; + + /** Maximum acceptable value */ + maxValue?: number; + + /** Target value (for partial credit calculation) */ + targetValue?: number; +} + +/** + * Diff evaluator - compare output to expected + */ +export interface DiffEvaluator extends EvaluatorBase { + type: 'diff'; + + /** Command that produces actual output */ + run: string; + + /** Expected output (inline) */ + expected?: string; + + /** Path to file with expected output */ + expectedFile?: string; + + /** Ignore whitespace differences */ + ignoreWhitespace?: boolean; + + /** Ignore case differences */ + ignoreCase?: boolean; +} + +/** + * LLM Judge evaluator - use AI to evaluate subjective criteria + */ +export interface LLMJudgeEvaluator extends EvaluatorBase { + type: 'llm_judge'; + + /** What to evaluate */ + evaluate: 'code_quality' | 'readability' | 'documentation' | 'custom'; + + /** Custom prompt for evaluation (when evaluate=custom) */ + prompt?: string; + + /** Files to include in evaluation context */ + files?: string; + + /** Model to use (default: configured default) */ + model?: string; +} + +/** + * Agent behavior evaluator - measure how the agent worked + */ +export interface AgentBehaviorEvaluator extends EvaluatorBase { + type: 'agent_behavior'; + + /** Which metric to evaluate */ + metric: 'time' | 'tokens' | 'iterations' | 'tool_calls' | 'self_corrections'; + + /** Maximum acceptable value */ + maxValue?: number; + + /** Minimum acceptable value */ + minValue?: number; + + /** Target value (for partial credit) */ + targetValue?: number; +} + +/** + * Union of all evaluator types + */ +export type Evaluator = + | CommandEvaluator + | PatternEvaluator + | BenchmarkEvaluator + | DiffEvaluator + | LLMJudgeEvaluator + | AgentBehaviorEvaluator; + +/** + * A criterion in a rubric (e.g., "correctness", "code_quality") + */ +export interface RubricCriterion { + /** Weight of this criterion (should sum to 100 across all criteria) */ + weight: number; + + /** Human-readable description */ + description?: string; + + /** Evaluators that contribute to this criterion's score */ + evaluators: Evaluator[]; +} + +/** + * A rubric - defines how to grade an agent's response + */ +export interface Rubric { + /** Unique identifier */ + id: string; + + /** Human-readable name */ + name: string; + + /** Description of when to use this rubric */ + description?: string; + + /** Another rubric to extend (inherit criteria from) */ + extends?: string; + + /** The grading criteria */ + criteria: Record; + + // Metadata + /** Source file path (added by loader) */ + _sourcePath?: string; +} + +/** + * Reference to a rubric with optional overrides + */ +export interface RubricReference { + /** ID of rubric to use as base */ + extends: string; + + /** Override specific criteria */ + criteria?: Record>; +} + +// ============================================================================= +// Result Types (What We Measured) +// ============================================================================= + +/** + * Result from a single evaluator + */ +export interface EvaluatorResult { + /** Name of the evaluator */ + name: string; + + /** Type of evaluator */ + type: EvaluatorType; + + /** Score from 0.0 to 1.0 */ + score: number; + + /** Whether this evaluator passed (score >= threshold) */ + passed: boolean; + + /** Evidence (stdout, stderr, or explanation) */ + evidence: string; + + /** Evaluator-specific details */ + details?: Record; + + /** Error message if evaluator failed to run */ + error?: string; + + /** Duration in milliseconds */ + durationMs: number; +} + +/** + * Result for a single criterion + */ +export interface CriterionResult { + /** Name of the criterion */ + name: string; + + /** Weight of this criterion */ + weight: number; + + /** Weighted score (score * weight / 100) */ + weightedScore: number; + + /** Raw score from 0.0 to 1.0 */ + score: number; + + /** Whether this criterion passed */ + passed: boolean; + + /** Results from individual evaluators */ + evaluatorResults: EvaluatorResult[]; +} + +/** + * Agent behavior trace (captured during execution) + */ +export interface AgentTrace { + /** Total execution time in ms */ + totalTimeMs: number; + + /** Total tokens used (input + output) */ + totalTokens: number; + + /** Number of turns/iterations */ + iterations: number; + + /** Tools that were called */ + toolsUsed: string[]; + + /** Number of self-corrections detected */ + selfCorrections: number; + + /** Per-turn details */ + turns?: AgentTurn[]; +} + +/** + * A single turn in the agent's execution + */ +export interface AgentTurn { + /** When this turn started */ + timestamp: Date; + + /** Tokens in (prompt) */ + tokensIn: number; + + /** Tokens out (response) */ + tokensOut: number; + + /** Tools called in this turn */ + toolCalls: string[]; + + /** Whether this turn was a self-correction */ + selfCorrection: boolean; +} + +/** + * Result from evaluating a single case + */ +export interface CaseResult { + /** Case that was evaluated */ + caseId: string; + + /** Overall score from 0 to 100 */ + score: number; + + /** Whether the case passed (score >= pass threshold) */ + passed: boolean; + + /** Results for each criterion */ + criteriaResults: CriterionResult[]; + + /** Agent behavior trace */ + agentTrace?: AgentTrace; + + /** The agent's text response */ + agentResponse?: string; + + /** Tool calls the agent made */ + agentToolCalls?: { name: string; durationMs?: number; success?: boolean }[]; + + /** Model used */ + agentModel?: string; + + /** Token usage */ + agentTokens?: { input: number; output: number; total: number }; + + /** Files produced by the agent (snapshot of workspace after agent runs) */ + agentFiles?: { path: string; content: string; changed: boolean }[]; + + /** Total duration in milliseconds */ + durationMs: number; + + /** Whether it timed out */ + timedOut: boolean; + + /** Error if something went wrong */ + error?: string; + + /** When this result was produced */ + timestamp: Date; +} + +/** + * Result from a full evaluation run + */ +export interface RunResult { + /** Unique run identifier */ + runId: string; + + /** When the run started */ + startedAt: Date; + + /** When the run completed */ + completedAt: Date; + + /** Agent that was evaluated */ + agent: string; + + /** Rubric used */ + rubricId: string; + + /** Results for each case */ + caseResults: CaseResult[]; + + /** Summary statistics */ + summary: RunSummary; +} + +/** + * Summary statistics for a run + */ +export interface RunSummary { + /** Total cases run */ + total: number; + + /** Cases that passed */ + passed: number; + + /** Cases that failed */ + failed: number; + + /** Cases that were skipped */ + skipped: number; + + /** Cases that timed out */ + timedOut: number; + + /** Average score across all cases */ + averageScore: number; + + /** Total duration in milliseconds */ + totalDurationMs: number; +}