2 changes: 1 addition & 1 deletion .env.example
@@ -23,7 +23,7 @@ TEST_SERVER_URL=http://localhost:2137

# Ollama Configuration
OLLAMA_URL=http://localhost:2142/v1
-OLLAMA_MODEL=speakleash/bielik-11b-v3.0-instruct:Q8_0
+OLLAMA_MODEL=mistral-small3.2:24b-instruct-2506-q8_0

# vLLM Qwen3-VL Configuration (Vision)
VLLM_QWEN3_VL_URL=http://localhost:2141/v1
3 changes: 3 additions & 0 deletions .gitignore
@@ -43,3 +43,6 @@ weaviate-data

# Dev server logs
/logs/

# Evaluation test results
agents/__tests__/evaluation/last-run/
14 changes: 10 additions & 4 deletions CLAUDE.md
@@ -42,7 +42,12 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
- **Return types**: Do not explicitly declare return types for functions - let TypeScript infer them automatically (e.g., `const add = (a: number, b: number) => a + b` instead of `const add = (a: number, b: number): number => a + b`)
- **Always use braces**: Always use curly braces `{}` for if statements, even for single-line blocks (e.g., `if (condition) { return value; }` instead of `if (condition) return value;`)
- **No magic numbers**: Extract numeric constants to named constants at the top of the file to make the code self-documenting (e.g., `const DEFAULT_PAGE_SIZE = 10;` instead of using `10` directly in code)
-- **User-facing strings must be translated**: All strings visible to users must be internationalized, including UI text, error messages from services, and API responses. Strings should never be hardcoded in components, services, or API handlers. Services must accept locale parameter to provide translated error messages that will be displayed to users.
+- **User-facing strings must be translated**: All strings visible to users must be internationalized. This includes:
+  - UI text in components
+  - Error messages from services and API responses
+  - **AI agent responses** (chatNode, productsNode, etc.) - all messages returned to users must use translations from `messages/` files
+  - Never hardcode user-facing strings in code - always use `getTranslations` or translation files
+  - Services and agents must accept locale parameter and use it to fetch translated strings
- This makes optional properties more concise and follows TypeScript best practices

## Documentation
@@ -596,9 +601,10 @@ make status # Check Docker services status
# Logs and Debugging
make logs # View Docker logs (follow mode)

-# Testing
-make test # Run all tests
-make test-watch # Run tests in watch mode
+# Testing (always run with TEST_LOCALE=en)
+TEST_LOCALE=en make test # Run all tests
+TEST_LOCALE=en make test-watch # Run tests in watch mode
+TEST_LOCALE=en npm test # Alternative: run tests with npm
make lint # Run ESLint
make type-check # Run TypeScript type checking

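The expanded translation rule above also covers agent nodes. As a rough sketch only — assuming the project uses next-intl (suggested by `getTranslations` and the `messages/` files), with a hypothetical `agents` namespace and `productsNode.noResults` key — an agent helper that honors the rule might look like this:

```typescript
// Hypothetical sketch of the translation rule for agents; the namespace and key
// names are illustrative, not taken from this repository.
import { getTranslations } from 'next-intl/server';

const FALLBACK_LOCALE = 'en';

export const buildNoResultsMessage = async (locale?: string) => {
  // The agent receives the locale and fetches a translated string instead of hardcoding it.
  const t = await getTranslations({
    locale: locale ?? FALLBACK_LOCALE,
    namespace: 'agents',
  });

  return t('productsNode.noResults');
};
```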
69 changes: 69 additions & 0 deletions Jenkinsfile.eval
@@ -0,0 +1,69 @@
pipeline {
agent any

tools {
nodejs 'Node24'
}

environment {
TEST_LOCALE = 'en'
OLLAMA_URL = 'http://192.168.68.80:11434/v1'
OLLAMA_MODEL = 'mistral-small3.2:24b-instruct-2506-q8_0'
MONGODB_URI = 'mongodb://localhost:27017/cognito-eval'
WEAVIATE_HTTP_HOST = 'localhost'
WEAVIATE_HTTP_PORT = '8080'
WEAVIATE_GRPC_HOST = 'localhost'
WEAVIATE_GRPC_PORT = '50051'
WEAVIATE_SECURE = 'false'
WEAVIATE_API_KEY = ''
}

stages {
stage('Checkout') {
steps {
checkout scm
}
}

stage('Start Infrastructure') {
steps {
sh '''
docker-compose -f docker-compose.eval.yml up -d

echo "Waiting for MongoDB..."
timeout 60 bash -c 'until docker exec cognito-eval-mongo mongosh --eval "db.runCommand({ ping: 1 })" > /dev/null 2>&1; do sleep 2; done'

echo "Waiting for Weaviate..."
timeout 120 bash -c 'until curl -s http://localhost:8080/v1/.well-known/ready > /dev/null 2>&1; do sleep 2; done'

echo "Infrastructure ready!"
'''
}
}

stage('Install Dependencies') {
steps {
sh 'npm ci'
}
}

stage('Run Evaluation Tests') {
steps {
sh 'npm run test:eval'
}
}
}

post {
always {
archiveArtifacts artifacts: 'agents/__tests__/evaluation/last-run/*.json', allowEmptyArchive: true
sh 'docker-compose -f docker-compose.eval.yml down -v || true'
}
failure {
echo 'Evaluation tests failed. Check archived artifacts for details.'
}
success {
echo 'All evaluation tests passed!'
}
}
}
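The `archiveArtifacts` step above collects JSON files from `agents/__tests__/evaluation/last-run/`, the same directory the new `.gitignore` entry excludes. How the test suite writes those files is not shown in this diff; a minimal sketch, assuming plain Node `fs` calls and a hypothetical helper name:

```typescript
// Hypothetical helper illustrating where the archived artifacts could come from;
// the actual evaluation suite may persist results differently.
import { mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';

const LAST_RUN_DIR = join('agents', '__tests__', 'evaluation', 'last-run');

export const saveRunArtifact = (name: string, payload: unknown) => {
  // Ensure the gitignored output directory exists, then write a pretty-printed JSON
  // file that the Jenkins post { always } block can archive.
  mkdirSync(LAST_RUN_DIR, { recursive: true });
  writeFileSync(join(LAST_RUN_DIR, `${name}.json`), JSON.stringify(payload, null, 2));
};
```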
78 changes: 78 additions & 0 deletions agents/__tests__/evaluation/conversationRunner.ts
@@ -0,0 +1,78 @@
import { executeChatGraphWithStream, IStreamCallback } from '@/agents/graph/chatGraph';
import { IConversationTurn } from './evaluator';

export interface IConversationScenario {
name: string;
locale: string;
turns: Array<{
userMessage: string;
validateResponse?: (response: string) => boolean;
}>;
expectedBehavior: string;
}

export interface IConversationResult {
scenario: IConversationScenario;
conversation: IConversationTurn[];
success: boolean;
error?: string;
}

const createNoopCallbacks = (): IStreamCallback => ({
onToken: () => {},
onComplete: () => {},
onError: () => {},
});

export const runConversation = async (
scenario: IConversationScenario
): Promise<IConversationResult> => {
const conversation: IConversationTurn[] = [];
const sessionId = `eval-${Date.now()}-${Math.random().toString(36).slice(2)}`;
const callbacks = createNoopCallbacks();

const messages: Array<{ role: string; content: string }> = [];

for (const turn of scenario.turns) {
messages.push({ role: 'user', content: turn.userMessage });
conversation.push({ role: 'user', content: turn.userMessage });

const response = await executeChatGraphWithStream(
sessionId,
scenario.locale,
messages,
callbacks
);

messages.push({ role: 'assistant', content: response });
conversation.push({ role: 'assistant', content: response });

if (turn.validateResponse && !turn.validateResponse(response)) {
return {
scenario,
conversation,
success: false,
error: `Response validation failed for turn: "${turn.userMessage}"`,
};
}
}

return {
scenario,
conversation,
success: true,
};
};

export const runMultipleConversations = async (
scenarios: IConversationScenario[]
): Promise<IConversationResult[]> => {
const results: IConversationResult[] = [];

for (const scenario of scenarios) {
const result = await runConversation(scenario);
results.push(result);
}

return results;
};
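For reference, a scenario wired through `runConversation` might look like the following; the user message and validation check are made up for illustration:

```typescript
// Hypothetical usage of the runner above; the scenario content is illustrative only.
import { runConversation, IConversationScenario } from './conversationRunner';

const scenario: IConversationScenario = {
  name: 'basic product search',
  locale: 'en',
  turns: [
    {
      userMessage: 'Do you have any running shoes?',
      // Optional per-turn guard on the raw assistant response.
      validateResponse: (response) => response.length > 0,
    },
  ],
  expectedBehavior: 'The assistant should list relevant products or ask a clarifying question.',
};

export const runExample = async () => {
  const result = await runConversation(scenario);

  if (!result.success) {
    console.error(result.error);
  }

  return result.conversation;
};
```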
166 changes: 166 additions & 0 deletions agents/__tests__/evaluation/evaluator.ts
@@ -0,0 +1,166 @@
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
import { createOllamaClient } from '@/services/llm/llm.service';

const EVALUATOR_TEMPERATURE = 0.1;
const EVALUATOR_MAX_TOKENS = 500;

export interface IEvaluationResult {
score: number;
reasoning: string;
passed: boolean;
}

export interface IConversationTurn {
role: 'user' | 'assistant';
content: string;
}

export interface IEvaluationCriteria {
name: string;
description: string;
weight: number;
}

const createEvaluationPrompt = (
criteria: IEvaluationCriteria[],
expectedBehavior: string
) => {
const criteriaList = criteria
.map((c, i) => `${i + 1}. ${c.name} (weight: ${c.weight}): ${c.description}`)
.join('\n');

return `You are an AI evaluator. Your task is to evaluate a conversation between a user and an e-commerce shopping assistant.

EVALUATION CRITERIA:
${criteriaList}

EXPECTED BEHAVIOR:
${expectedBehavior}

SCORING INSTRUCTIONS:
- Score each criterion from 1 to 5:
1 = Very poor, completely fails the criterion
2 = Poor, mostly fails with minor success
3 = Acceptable, meets basic expectations
4 = Good, exceeds expectations in some areas
5 = Excellent, fully meets or exceeds all expectations

- Calculate weighted average score
- Response MUST be in this exact JSON format:
{
"scores": [
{"criterion": "criterion_name", "score": X, "reason": "brief explanation"}
],
"overall_score": X.X,
"reasoning": "overall assessment",
"passed": true/false
}`;
};

const formatConversationForEvaluation = (conversation: IConversationTurn[]) => {
return conversation
.map((turn) => `${turn.role.toUpperCase()}: ${turn.content}`)
.join('\n\n');
};

const parseEvaluationResponse = (response: string): IEvaluationResult => {
const jsonMatch = response.match(/\{[\s\S]*\}/);
if (!jsonMatch) {
return {
score: 1,
reasoning: 'Failed to parse evaluation response',
passed: false,
};
}

try {
const parsed = JSON.parse(jsonMatch[0]);

return {
score: parsed.overall_score,
reasoning: parsed.reasoning,
passed: parsed.passed,
};
} catch {
const scoreMatch = response.match(/overall_score["\s:]+(\d+\.?\d*)/);
const reasoningMatch = response.match(/reasoning["\s:]+["']([^"']+)["']/);
const passedMatch = response.match(/passed["\s:]+(\w+)/);

const score = scoreMatch ? parseFloat(scoreMatch[1]) : 1;
const reasoning = reasoningMatch ? reasoningMatch[1] : 'Failed to parse reasoning';
const passed = passedMatch ? passedMatch[1] === 'true' : false;

return { score, reasoning, passed };
}
};

export const evaluateConversation = async (
conversation: IConversationTurn[],
criteria: IEvaluationCriteria[],
expectedBehavior: string
): Promise<IEvaluationResult> => {
const llm = createOllamaClient(EVALUATOR_TEMPERATURE, EVALUATOR_MAX_TOKENS);

const systemPrompt = createEvaluationPrompt(criteria, expectedBehavior);
const conversationText = formatConversationForEvaluation(conversation);

const response = await llm.invoke([
new SystemMessage(systemPrompt),
new HumanMessage(`CONVERSATION TO EVALUATE:\n\n${conversationText}`),
]);

const content = response.content.toString();

return parseEvaluationResponse(content);
};

export const defaultProductSearchCriteria: IEvaluationCriteria[] = [
{
name: 'Relevance',
description: 'Does the assistant return products relevant to the user query?',
weight: 3,
},
{
name: 'Completeness',
description: 'Does the response include necessary product details (name, price, category)?',
weight: 2,
},
{
name: 'Helpfulness',
description: 'Is the assistant helpful in guiding the user to find products?',
weight: 2,
},
{
name: 'Accuracy',
description: 'Are the product details accurate and properly formatted?',
weight: 2,
},
{
name: 'Natural Language',
description: 'Is the response natural and easy to understand?',
weight: 1,
},
];

export const defaultChatCriteria: IEvaluationCriteria[] = [
{
name: 'Appropriateness',
description: 'Is the response appropriate for the user message?',
weight: 3,
},
{
name: 'Helpfulness',
description: 'Does the assistant provide helpful information or guidance?',
weight: 2,
},
{
name: 'Coherence',
description: 'Is the response coherent and logically structured?',
weight: 2,
},
{
name: 'Tone',
description: 'Is the tone friendly and professional?',
weight: 1,
},
];
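Putting the two new modules together, an evaluation test can run a scenario and then hand the transcript to the LLM judge with the default criteria. A minimal sketch — the pass threshold is an assumption, not something defined in this PR:

```typescript
// Sketch of an end-to-end evaluation: run a scenario, then score the transcript.
import { runConversation, IConversationScenario } from './conversationRunner';
import { evaluateConversation, defaultProductSearchCriteria } from './evaluator';

const MIN_PASSING_SCORE = 3.5; // hypothetical threshold on the 1-5 scale

export const evaluateScenario = async (scenario: IConversationScenario) => {
  const run = await runConversation(scenario);

  if (!run.success) {
    return { passed: false, reasoning: run.error ?? 'Conversation run failed' };
  }

  const evaluation = await evaluateConversation(
    run.conversation,
    defaultProductSearchCriteria,
    scenario.expectedBehavior
  );

  return {
    passed: evaluation.passed && evaluation.score >= MIN_PASSING_SCORE,
    reasoning: evaluation.reasoning,
  };
};
```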