2 changes: 1 addition & 1 deletion .env.example
@@ -23,7 +23,7 @@ TEST_SERVER_URL=http://localhost:2137

# Ollama Configuration
OLLAMA_URL=http://localhost:2142/v1
-OLLAMA_MODEL=speakleash/bielik-11b-v3.0-instruct:Q8_0
+OLLAMA_MODEL=mistral-small3.2:24b-instruct-2506-q8_0

# vLLM Qwen3-VL Configuration (Vision)
VLLM_QWEN3_VL_URL=http://localhost:2141/v1
3 changes: 3 additions & 0 deletions .gitignore
@@ -43,3 +43,6 @@ weaviate-data

# Dev server logs
/logs/

# Evaluation test results
agents/__tests__/evaluation/last-run/
14 changes: 10 additions & 4 deletions CLAUDE.md
@@ -42,7 +42,12 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
- **Return types**: Do not explicitly declare return types for functions - let TypeScript infer them automatically (e.g., `const add = (a: number, b: number) => a + b` instead of `const add = (a: number, b: number): number => a + b`)
- **Always use braces**: Always use curly braces `{}` for if statements, even for single-line blocks (e.g., `if (condition) { return value; }` instead of `if (condition) return value;`)
- **No magic numbers**: Extract numeric constants to named constants at the top of the file to make the code self-documenting (e.g., `const DEFAULT_PAGE_SIZE = 10;` instead of using `10` directly in code)
-- **User-facing strings must be translated**: All strings visible to users must be internationalized, including UI text, error messages from services, and API responses. Strings should never be hardcoded in components, services, or API handlers. Services must accept locale parameter to provide translated error messages that will be displayed to users.
+- **User-facing strings must be translated**: All strings visible to users must be internationalized. This includes:
+  - UI text in components
+  - Error messages from services and API responses
+  - **AI agent responses** (chatNode, productsNode, etc.) - all messages returned to users must use translations from `messages/` files
+  - Never hardcode user-facing strings in code - always use `getTranslations` or translation files
+  - Services and agents must accept locale parameter and use it to fetch translated strings
- This makes optional properties more concise and follows TypeScript best practices

## Documentation
@@ -596,9 +601,10 @@ make status # Check Docker services status
# Logs and Debugging
make logs # View Docker logs (follow mode)

-# Testing
-make test # Run all tests
-make test-watch # Run tests in watch mode
+# Testing (always run with TEST_LOCALE=en)
+TEST_LOCALE=en make test # Run all tests
+TEST_LOCALE=en make test-watch # Run tests in watch mode
+TEST_LOCALE=en npm test # Alternative: run tests with npm
make lint # Run ESLint
make type-check # Run TypeScript type checking

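The expanded translation rule above also covers agent nodes. As a rough sketch only — assuming the project uses next-intl (suggested by `getTranslations` and the `messages/` files), with a hypothetical `agents` namespace and `productsNode.noResults` key — an agent helper that honors the rule might look like this:

```typescript
// Hypothetical sketch of the translation rule for agents; the namespace and key
// names are illustrative, not taken from this repository.
import { getTranslations } from 'next-intl/server';

const FALLBACK_LOCALE = 'en';

export const buildNoResultsMessage = async (locale?: string) => {
  // The agent receives the locale and fetches a translated string instead of hardcoding it.
  const t = await getTranslations({
    locale: locale ?? FALLBACK_LOCALE,
    namespace: 'agents',
  });

  return t('productsNode.noResults');
};
```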
69 changes: 69 additions & 0 deletions Jenkinsfile.eval
@@ -0,0 +1,69 @@
pipeline {
agent any

tools {
nodejs 'Node24'
}

environment {
TEST_LOCALE = 'en'
OLLAMA_URL = 'http://192.168.68.80:11434/v1'
OLLAMA_MODEL = 'mistral-small3.2:24b-instruct-2506-q8_0'
MONGODB_URI = 'mongodb://localhost:27017/cognito-eval'
WEAVIATE_HTTP_HOST = 'localhost'
WEAVIATE_HTTP_PORT = '8080'
WEAVIATE_GRPC_HOST = 'localhost'
WEAVIATE_GRPC_PORT = '50051'
WEAVIATE_SECURE = 'false'
WEAVIATE_API_KEY = ''
}

stages {
stage('Checkout') {
steps {
checkout scm
}
}

stage('Start Infrastructure') {
steps {
sh '''
docker-compose -f docker-compose.eval.yml up -d

echo "Waiting for MongoDB..."
timeout 60 bash -c 'until docker exec cognito-eval-mongo mongosh --eval "db.runCommand({ ping: 1 })" > /dev/null 2>&1; do sleep 2; done'

echo "Waiting for Weaviate..."
timeout 120 bash -c 'until curl -s http://localhost:8080/v1/.well-known/ready > /dev/null 2>&1; do sleep 2; done'

echo "Infrastructure ready!"
'''
}
}

stage('Install Dependencies') {
steps {
sh 'npm ci'
}
}

stage('Run Evaluation Tests') {
steps {
sh 'npm run test:eval'
}
}
}

post {
always {
archiveArtifacts artifacts: 'agents/__tests__/evaluation/last-run/*.json', allowEmptyArchive: true
sh 'docker-compose -f docker-compose.eval.yml down -v || true'
}
failure {
echo 'Evaluation tests failed. Check archived artifacts for details.'
}
success {
echo 'All evaluation tests passed!'
}
}
}
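The `archiveArtifacts` step above collects JSON files from `agents/__tests__/evaluation/last-run/`, the same directory the new `.gitignore` entry excludes. How the test suite writes those files is not shown in this diff; a minimal sketch, assuming plain Node `fs` calls and a hypothetical helper name:

```typescript
// Hypothetical helper illustrating where the archived artifacts could come from;
// the actual evaluation suite may persist results differently.
import { mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';

const LAST_RUN_DIR = join('agents', '__tests__', 'evaluation', 'last-run');

export const saveRunArtifact = (name: string, payload: unknown) => {
  // Ensure the gitignored output directory exists, then write a pretty-printed JSON
  // file that the Jenkins post { always } block can archive.
  mkdirSync(LAST_RUN_DIR, { recursive: true });
  writeFileSync(join(LAST_RUN_DIR, `${name}.json`), JSON.stringify(payload, null, 2));
};
```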
78 changes: 78 additions & 0 deletions agents/__tests__/evaluation/conversationRunner.ts
@@ -0,0 +1,78 @@
import { executeChatGraphWithStream, IStreamCallback } from '@/agents/graph/chatGraph';
import { IConversationTurn } from './evaluator';

export interface IConversationScenario {
name: string;
locale: string;
turns: Array<{
userMessage: string;
validateResponse?: (response: string) => boolean;
}>;
expectedBehavior: string;
}

export interface IConversationResult {
scenario: IConversationScenario;
conversation: IConversationTurn[];
success: boolean;
error?: string;
}

const createNoopCallbacks = (): IStreamCallback => ({
onToken: () => {},
onComplete: () => {},
onError: () => {},
});

export const runConversation = async (
scenario: IConversationScenario
): Promise<IConversationResult> => {
const conversation: IConversationTurn[] = [];
const sessionId = `eval-${Date.now()}-${Math.random().toString(36).slice(2)}`;
const callbacks = createNoopCallbacks();

const messages: Array<{ role: string; content: string }> = [];

for (const turn of scenario.turns) {
messages.push({ role: 'user', content: turn.userMessage });
conversation.push({ role: 'user', content: turn.userMessage });

const response = await executeChatGraphWithStream(
sessionId,
scenario.locale,
messages,
callbacks
);

messages.push({ role: 'assistant', content: response });
conversation.push({ role: 'assistant', content: response });

if (turn.validateResponse && !turn.validateResponse(response)) {
return {
scenario,
conversation,
success: false,
error: `Response validation failed for turn: "${turn.userMessage}"`,
};
}
}

return {
scenario,
conversation,
success: true,
};
};

export const runMultipleConversations = async (
scenarios: IConversationScenario[]
): Promise<IConversationResult[]> => {
const results: IConversationResult[] = [];

for (const scenario of scenarios) {
const result = await runConversation(scenario);
results.push(result);
}

return results;
};
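For reference, a scenario wired through `runConversation` might look like the following; the user message and validation check are made up for illustration:

```typescript
// Hypothetical usage of the runner above; the scenario content is illustrative only.
import { runConversation, IConversationScenario } from './conversationRunner';

const scenario: IConversationScenario = {
  name: 'basic product search',
  locale: 'en',
  turns: [
    {
      userMessage: 'Do you have any running shoes?',
      // Optional per-turn guard on the raw assistant response.
      validateResponse: (response) => response.length > 0,
    },
  ],
  expectedBehavior: 'The assistant should list relevant products or ask a clarifying question.',
};

export const runExample = async () => {
  const result = await runConversation(scenario);

  if (!result.success) {
    console.error(result.error);
  }

  return result.conversation;
};
```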
166 changes: 166 additions & 0 deletions agents/__tests__/evaluation/evaluator.ts
@@ -0,0 +1,166 @@
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
import { createOllamaClient } from '@/services/llm/llm.service';

const EVALUATOR_TEMPERATURE = 0.1;
const EVALUATOR_MAX_TOKENS = 500;

export interface IEvaluationResult {
score: number;
reasoning: string;
passed: boolean;
}

export interface IConversationTurn {
role: 'user' | 'assistant';
content: string;
}

export interface IEvaluationCriteria {
name: string;
description: string;
weight: number;
}

const createEvaluationPrompt = (
criteria: IEvaluationCriteria[],
expectedBehavior: string
) => {
const criteriaList = criteria
.map((c, i) => `${i + 1}. ${c.name} (weight: ${c.weight}): ${c.description}`)
.join('\n');

return `You are an AI evaluator. Your task is to evaluate a conversation between a user and an e-commerce shopping assistant.

EVALUATION CRITERIA:
${criteriaList}

EXPECTED BEHAVIOR:
${expectedBehavior}

SCORING INSTRUCTIONS:
- Score each criterion from 1 to 5:
1 = Very poor, completely fails the criterion
2 = Poor, mostly fails with minor success
3 = Acceptable, meets basic expectations
4 = Good, exceeds expectations in some areas
5 = Excellent, fully meets or exceeds all expectations

- Calculate weighted average score
- Response MUST be in this exact JSON format:
{
"scores": [
{"criterion": "criterion_name", "score": X, "reason": "brief explanation"}
],
"overall_score": X.X,
"reasoning": "overall assessment",
"passed": true/false
}`;
};

const formatConversationForEvaluation = (conversation: IConversationTurn[]) => {
return conversation
.map((turn) => `${turn.role.toUpperCase()}: ${turn.content}`)
.join('\n\n');
};

const parseEvaluationResponse = (response: string): IEvaluationResult => {
const jsonMatch = response.match(/\{[\s\S]*\}/);
if (!jsonMatch) {
return {
score: 1,
reasoning: 'Failed to parse evaluation response',
passed: false,
};
}

try {
const parsed = JSON.parse(jsonMatch[0]);

return {
score: parsed.overall_score,
reasoning: parsed.reasoning,
passed: parsed.passed,
};
} catch {
const scoreMatch = response.match(/overall_score["\s:]+(\d+\.?\d*)/);
const reasoningMatch = response.match(/reasoning["\s:]+["']([^"']+)["']/);
const passedMatch = response.match(/passed["\s:]+(\w+)/);

const score = scoreMatch ? parseFloat(scoreMatch[1]) : 1;
const reasoning = reasoningMatch ? reasoningMatch[1] : 'Failed to parse reasoning';
const passed = passedMatch ? passedMatch[1] === 'true' : false;

return { score, reasoning, passed };
}
};

export const evaluateConversation = async (
conversation: IConversationTurn[],
criteria: IEvaluationCriteria[],
expectedBehavior: string
): Promise<IEvaluationResult> => {
const llm = createOllamaClient(EVALUATOR_TEMPERATURE, EVALUATOR_MAX_TOKENS);

const systemPrompt = createEvaluationPrompt(criteria, expectedBehavior);
const conversationText = formatConversationForEvaluation(conversation);

const response = await llm.invoke([
new SystemMessage(systemPrompt),
new HumanMessage(`CONVERSATION TO EVALUATE:\n\n${conversationText}`),
]);

const content = response.content.toString();

return parseEvaluationResponse(content);
};

export const defaultProductSearchCriteria: IEvaluationCriteria[] = [
{
name: 'Relevance',
description: 'Does the assistant return products relevant to the user query?',
weight: 3,
},
{
name: 'Completeness',
description: 'Does the response include necessary product details (name, price, category)?',
weight: 2,
},
{
name: 'Helpfulness',
description: 'Is the assistant helpful in guiding the user to find products?',
weight: 2,
},
{
name: 'Accuracy',
description: 'Are the product details accurate and properly formatted?',
weight: 2,
},
{
name: 'Natural Language',
description: 'Is the response natural and easy to understand?',
weight: 1,
},
];

export const defaultChatCriteria: IEvaluationCriteria[] = [
{
name: 'Appropriateness',
description: 'Is the response appropriate for the user message?',
weight: 3,
},
{
name: 'Helpfulness',
description: 'Does the assistant provide helpful information or guidance?',
weight: 2,
},
{
name: 'Coherence',
description: 'Is the response coherent and logically structured?',
weight: 2,
},
{
name: 'Tone',
description: 'Is the tone friendly and professional?',
weight: 1,
},
];
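Putting the two new modules together, an evaluation test can run a scenario and then hand the transcript to the LLM judge with the default criteria. A minimal sketch — the pass threshold is an assumption, not something defined in this PR:

```typescript
// Sketch of an end-to-end evaluation: run a scenario, then score the transcript.
import { runConversation, IConversationScenario } from './conversationRunner';
import { evaluateConversation, defaultProductSearchCriteria } from './evaluator';

const MIN_PASSING_SCORE = 3.5; // hypothetical threshold on the 1-5 scale

export const evaluateScenario = async (scenario: IConversationScenario) => {
  const run = await runConversation(scenario);

  if (!run.success) {
    return { passed: false, reasoning: run.error ?? 'Conversation run failed' };
  }

  const evaluation = await evaluateConversation(
    run.conversation,
    defaultProductSearchCriteria,
    scenario.expectedBehavior
  );

  return {
    passed: evaluation.passed && evaluation.score >= MIN_PASSING_SCORE,
    reasoning: evaluation.reasoning,
  };
};
```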