From 1c11c211220d1d1d3adbbe6ea0de9df21c152bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Artur=20S=C5=82omowski?= Date: Mon, 12 Jan 2026 00:41:21 +0100 Subject: [PATCH] feat: Add semantic product search with LLM evaluation tests - Implement semantic product search using Weaviate text_vector - Add LLM-based search query extraction from conversation context - Create LLM-as-judge evaluation framework for e2e testing - Add translations for product search responses (en/pl) Product Search: - searchProductIdsInWeaviate for semantic search via nearText - productsNode extracts query from full conversation, not just last message - Returns formatted product list with prices, categories, stock status Evaluation Framework: - evaluator.ts: LLM-as-judge using Bielik model (score 1-5) - conversationRunner.ts: Execute multi-turn conversations - productSearch.e2e.test.ts: 11 scenarios (single-turn, multi-turn, edge cases) - Separate vitest config for e2e tests (npm run test:eval) Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- .env.example | 2 +- .gitignore | 3 + CLAUDE.md | 14 +- Jenkinsfile.eval | 69 ++++++ .../evaluation/conversationRunner.ts | 78 +++++++ agents/__tests__/evaluation/evaluator.ts | 166 +++++++++++++ .../evaluation/productSearch.e2e.test.ts | 218 ++++++++++++++++++ agents/__tests__/evaluation/testFixtures.ts | 133 +++++++++++ .../evaluation/testResultsReporter.ts | 62 +++++ agents/graph/chatGraph.test.ts | 176 +++++++++++++- agents/graph/nodes/chatNode.ts | 4 +- agents/graph/nodes/productsNode.ts | 133 ++++++++++- agents/graph/nodes/routerNode.ts | 4 +- agents/prompts/productsPrompts.ts | 53 +++++ agents/utils/translations.ts | 57 +++++ app/[locale]/api/registration/route.ts | 2 - docker-compose.eval.yml | 35 +++ docker-compose.yml | 10 + docs/JENKINS_EVAL.md | 194 ++++++++++++++++ messages/en.json | 11 + messages/pl.json | 11 + models/products/weaviateProductsModel.ts | 18 
++ package.json | 1 + services/llm/llm.service.test.ts | 24 +- services/llm/llm.service.ts | 2 +- vitest.config.ts | 2 +- vitest.e2e.config.ts | 28 +++ 27 files changed, 1466 insertions(+), 44 deletions(-) create mode 100644 Jenkinsfile.eval create mode 100644 agents/__tests__/evaluation/conversationRunner.ts create mode 100644 agents/__tests__/evaluation/evaluator.ts create mode 100644 agents/__tests__/evaluation/productSearch.e2e.test.ts create mode 100644 agents/__tests__/evaluation/testFixtures.ts create mode 100644 agents/__tests__/evaluation/testResultsReporter.ts create mode 100644 agents/prompts/productsPrompts.ts create mode 100644 agents/utils/translations.ts create mode 100644 docker-compose.eval.yml create mode 100644 docs/JENKINS_EVAL.md create mode 100644 vitest.e2e.config.ts diff --git a/.env.example b/.env.example index 0d1833e..6569846 100644 --- a/.env.example +++ b/.env.example @@ -23,7 +23,7 @@ TEST_SERVER_URL=http://localhost:2137 # Ollama Configuration OLLAMA_URL=http://localhost:2142/v1 -OLLAMA_MODEL=speakleash/bielik-11b-v3.0-instruct:Q8_0 +OLLAMA_MODEL=mistral-small3.2:24b-instruct-2506-q8_0 # vLLM Qwen3-VL Configuration (Vision) VLLM_QWEN3_VL_URL=http://localhost:2141/v1 diff --git a/.gitignore b/.gitignore index 6a1121e..4322c5a 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ weaviate-data # Dev server logs /logs/ + +# Evaluation test results +agents/__tests__/evaluation/last-run/ diff --git a/CLAUDE.md b/CLAUDE.md index 3dfdf36..22f9ae6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,7 +42,12 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co - **Return types**: Do not explicitly declare return types for functions - let TypeScript infer them automatically (e.g., `const add = (a: number, b: number) => a + b` instead of `const add = (a: number, b: number): number => a + b`) - **Always use braces**: Always use curly braces `{}` for if statements, even for single-line blocks (e.g., `if (condition) { 
return value; }` instead of `if (condition) return value;`) - **No magic numbers**: Extract numeric constants to named constants at the top of the file to make the code self-documenting (e.g., `const DEFAULT_PAGE_SIZE = 10;` instead of using `10` directly in code) -- **User-facing strings must be translated**: All strings visible to users must be internationalized, including UI text, error messages from services, and API responses. Strings should never be hardcoded in components, services, or API handlers. Services must accept locale parameter to provide translated error messages that will be displayed to users. +- **User-facing strings must be translated**: All strings visible to users must be internationalized. This includes: + - UI text in components + - Error messages from services and API responses + - **AI agent responses** (chatNode, productsNode, etc.) - all messages returned to users must use translations from `messages/` files + - Never hardcode user-facing strings in code - always use `getTranslations` or translation files + - Services and agents must accept locale parameter and use it to fetch translated strings - This makes optional properties more concise and follows TypeScript best practices ## Documentation @@ -596,9 +601,10 @@ make status # Check Docker services status # Logs and Debugging make logs # View Docker logs (follow mode) -# Testing -make test # Run all tests -make test-watch # Run tests in watch mode +# Testing (always run with TEST_LOCALE=en) +TEST_LOCALE=en make test # Run all tests +TEST_LOCALE=en make test-watch # Run tests in watch mode +TEST_LOCALE=en npm test # Alternative: run tests with npm make lint # Run ESLint make type-check # Run TypeScript type checking diff --git a/Jenkinsfile.eval b/Jenkinsfile.eval new file mode 100644 index 0000000..d8735ad --- /dev/null +++ b/Jenkinsfile.eval @@ -0,0 +1,69 @@ +pipeline { + agent any + + tools { + nodejs 'Node24' + } + + environment { + TEST_LOCALE = 'en' + OLLAMA_URL = 
'http://192.168.68.80:11434/v1' + OLLAMA_MODEL = 'mistral-small3.2:24b-instruct-2506-q8_0' + MONGODB_URI = 'mongodb://localhost:27017/cognito-eval' + WEAVIATE_HTTP_HOST = 'localhost' + WEAVIATE_HTTP_PORT = '8080' + WEAVIATE_GRPC_HOST = 'localhost' + WEAVIATE_GRPC_PORT = '50051' + WEAVIATE_SECURE = 'false' + WEAVIATE_API_KEY = '' + } + + stages { + stage('Checkout') { + steps { + checkout scm + } + } + + stage('Start Infrastructure') { + steps { + sh ''' + docker-compose -f docker-compose.eval.yml up -d + + echo "Waiting for MongoDB..." + timeout 60 bash -c 'until docker exec cognito-eval-mongo mongosh --eval "db.runCommand({ ping: 1 })" > /dev/null 2>&1; do sleep 2; done' + + echo "Waiting for Weaviate..." + timeout 120 bash -c 'until curl -s http://localhost:8080/v1/.well-known/ready > /dev/null 2>&1; do sleep 2; done' + + echo "Infrastructure ready!" + ''' + } + } + + stage('Install Dependencies') { + steps { + sh 'npm ci' + } + } + + stage('Run Evaluation Tests') { + steps { + sh 'npm run test:eval' + } + } + } + + post { + always { + archiveArtifacts artifacts: 'agents/__tests__/evaluation/last-run/*.json', allowEmptyArchive: true + sh 'docker-compose -f docker-compose.eval.yml down -v || true' + } + failure { + echo 'Evaluation tests failed. Check archived artifacts for details.' + } + success { + echo 'All evaluation tests passed!' 
+ } + } +} diff --git a/agents/__tests__/evaluation/conversationRunner.ts b/agents/__tests__/evaluation/conversationRunner.ts new file mode 100644 index 0000000..8d8b0c3 --- /dev/null +++ b/agents/__tests__/evaluation/conversationRunner.ts @@ -0,0 +1,78 @@ +import { executeChatGraphWithStream, IStreamCallback } from '@/agents/graph/chatGraph'; +import { IConversationTurn } from './evaluator'; + +export interface IConversationScenario { + name: string; + locale: string; + turns: Array<{ + userMessage: string; + validateResponse?: (response: string) => boolean; + }>; + expectedBehavior: string; +} + +export interface IConversationResult { + scenario: IConversationScenario; + conversation: IConversationTurn[]; + success: boolean; + error?: string; +} + +const createNoopCallbacks = (): IStreamCallback => ({ + onToken: () => {}, + onComplete: () => {}, + onError: () => {}, +}); + +export const runConversation = async ( + scenario: IConversationScenario +): Promise<IConversationResult> => { + const conversation: IConversationTurn[] = []; + const sessionId = `eval-${Date.now()}-${Math.random().toString(36).slice(2)}`; + const callbacks = createNoopCallbacks(); + + const messages: Array<{ role: string; content: string }> = []; + + for (const turn of scenario.turns) { + messages.push({ role: 'user', content: turn.userMessage }); + conversation.push({ role: 'user', content: turn.userMessage }); + + const response = await executeChatGraphWithStream( + sessionId, + scenario.locale, + messages, + callbacks + ); + + messages.push({ role: 'assistant', content: response }); + conversation.push({ role: 'assistant', content: response }); + + if (turn.validateResponse && !turn.validateResponse(response)) { + return { + scenario, + conversation, + success: false, + error: `Response validation failed for turn: "${turn.userMessage}"`, + }; + } + } + + return { + scenario, + conversation, + success: true, + }; +}; + +export const runMultipleConversations = async ( + scenarios: IConversationScenario[] +): 
Promise<IConversationResult[]> => { + const results: IConversationResult[] = []; + + for (const scenario of scenarios) { + const result = await runConversation(scenario); + results.push(result); + } + + return results; +}; diff --git a/agents/__tests__/evaluation/evaluator.ts b/agents/__tests__/evaluation/evaluator.ts new file mode 100644 index 0000000..2685fa6 --- /dev/null +++ b/agents/__tests__/evaluation/evaluator.ts @@ -0,0 +1,166 @@ +import { HumanMessage, SystemMessage } from '@langchain/core/messages'; +import { createOllamaClient } from '@/services/llm/llm.service'; + +const EVALUATOR_TEMPERATURE = 0.1; +const EVALUATOR_MAX_TOKENS = 500; + +export interface IEvaluationResult { + score: number; + reasoning: string; + passed: boolean; +} + +export interface IConversationTurn { + role: 'user' | 'assistant'; + content: string; +} + +export interface IEvaluationCriteria { + name: string; + description: string; + weight: number; +} + +const createEvaluationPrompt = ( + criteria: IEvaluationCriteria[], + expectedBehavior: string +) => { + const criteriaList = criteria + .map((c, i) => `${i + 1}. ${c.name} (weight: ${c.weight}): ${c.description}`) + .join('\n'); + + return `You are an AI evaluator. Your task is to evaluate a conversation between a user and an e-commerce shopping assistant. 
+ +EVALUATION CRITERIA: +${criteriaList} + +EXPECTED BEHAVIOR: +${expectedBehavior} + +SCORING INSTRUCTIONS: +- Score each criterion from 1 to 5: + 1 = Very poor, completely fails the criterion + 2 = Poor, mostly fails with minor success + 3 = Acceptable, meets basic expectations + 4 = Good, exceeds expectations in some areas + 5 = Excellent, fully meets or exceeds all expectations + +- Calculate weighted average score +- Response MUST be in this exact JSON format: +{ + "scores": [ + {"criterion": "criterion_name", "score": X, "reason": "brief explanation"} + ], + "overall_score": X.X, + "reasoning": "overall assessment", + "passed": true/false +}`; +}; + +const formatConversationForEvaluation = (conversation: IConversationTurn[]) => { + return conversation + .map((turn) => `${turn.role.toUpperCase()}: ${turn.content}`) + .join('\n\n'); +}; + +const parseEvaluationResponse = (response: string): IEvaluationResult => { + const jsonMatch = response.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + return { + score: 1, + reasoning: 'Failed to parse evaluation response', + passed: false, + }; + } + + try { + const parsed = JSON.parse(jsonMatch[0]); + + return { + score: parsed.overall_score, + reasoning: parsed.reasoning, + passed: parsed.passed, + }; + } catch { + const scoreMatch = response.match(/overall_score["\s:]+(\d+\.?\d*)/); + const reasoningMatch = response.match(/reasoning["\s:]+["']([^"']+)["']/); + const passedMatch = response.match(/passed["\s:]+(\w+)/); + + const score = scoreMatch ? parseFloat(scoreMatch[1]) : 1; + const reasoning = reasoningMatch ? reasoningMatch[1] : 'Failed to parse reasoning'; + const passed = passedMatch ? 
passedMatch[1] === 'true' : false; + + return { score, reasoning, passed }; + } +}; + +export const evaluateConversation = async ( + conversation: IConversationTurn[], + criteria: IEvaluationCriteria[], + expectedBehavior: string +): Promise => { + const llm = createOllamaClient(EVALUATOR_TEMPERATURE, EVALUATOR_MAX_TOKENS); + + const systemPrompt = createEvaluationPrompt(criteria, expectedBehavior); + const conversationText = formatConversationForEvaluation(conversation); + + const response = await llm.invoke([ + new SystemMessage(systemPrompt), + new HumanMessage(`CONVERSATION TO EVALUATE:\n\n${conversationText}`), + ]); + + const content = response.content.toString(); + + return parseEvaluationResponse(content); +}; + +export const defaultProductSearchCriteria: IEvaluationCriteria[] = [ + { + name: 'Relevance', + description: 'Does the assistant return products relevant to the user query?', + weight: 3, + }, + { + name: 'Completeness', + description: 'Does the response include necessary product details (name, price, category)?', + weight: 2, + }, + { + name: 'Helpfulness', + description: 'Is the assistant helpful in guiding the user to find products?', + weight: 2, + }, + { + name: 'Accuracy', + description: 'Are the product details accurate and properly formatted?', + weight: 2, + }, + { + name: 'Natural Language', + description: 'Is the response natural and easy to understand?', + weight: 1, + }, +]; + +export const defaultChatCriteria: IEvaluationCriteria[] = [ + { + name: 'Appropriateness', + description: 'Is the response appropriate for the user message?', + weight: 3, + }, + { + name: 'Helpfulness', + description: 'Does the assistant provide helpful information or guidance?', + weight: 2, + }, + { + name: 'Coherence', + description: 'Is the response coherent and logically structured?', + weight: 2, + }, + { + name: 'Tone', + description: 'Is the tone friendly and professional?', + weight: 1, + }, +]; diff --git 
a/agents/__tests__/evaluation/productSearch.e2e.test.ts b/agents/__tests__/evaluation/productSearch.e2e.test.ts new file mode 100644 index 0000000..c2d0b1e --- /dev/null +++ b/agents/__tests__/evaluation/productSearch.e2e.test.ts @@ -0,0 +1,218 @@ +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { + evaluateConversation, + defaultProductSearchCriteria, + defaultChatCriteria, + IEvaluationResult, + IConversationTurn, +} from './evaluator'; +import { runConversation, IConversationScenario } from './conversationRunner'; +import { clearLastRunDirectory, saveFailedTest } from './testResultsReporter'; +import { setupTestProducts, teardownTestProducts } from './testFixtures'; + +const MINIMUM_PASSING_SCORE = 4.0; + +beforeAll(async () => { + clearLastRunDirectory(); + await setupTestProducts(); +}, 60000); + +afterAll(async () => { + await teardownTestProducts(); +}, 30000); + +const productSearchScenarios: IConversationScenario[] = [ + { + name: 'Simple laptop search', + locale: 'en', + turns: [{ userMessage: 'Show me laptops' }], + expectedBehavior: + 'The assistant should return a list of products including laptops. Response should include product names and prices. Some non-laptop products may appear due to semantic search - this is acceptable.', + }, + { + name: 'Smartphone search', + locale: 'en', + turns: [{ userMessage: 'I want to buy a smartphone' }], + expectedBehavior: + 'The assistant should return products including smartphones. Response should include product names and prices. Some related products may appear - this is acceptable.', + }, + { + name: 'Smartphone search in Polish', + locale: 'pl', + turns: [{ userMessage: 'Pokaż mi smartfony' }], + expectedBehavior: + 'The assistant should return products including smartphones. Response should include product names and prices. 
The system should understand Polish language queries.', + }, + { + name: 'Gaming peripherals search', + locale: 'en', + turns: [{ userMessage: 'Show me gaming keyboards and gaming mice' }], + expectedBehavior: + 'The assistant should return products related to gaming peripherals. Response should include product names and prices. Some related gaming products may appear - this is acceptable.', + }, + { + name: 'Audio equipment search', + locale: 'en', + turns: [{ userMessage: 'Show me headphones' }], + expectedBehavior: + 'The assistant should return products including headphones or audio equipment. Response should include product names and prices.', + }, +]; + +const multiTurnScenarios: IConversationScenario[] = [ + { + name: 'Multi-turn gaming setup', + locale: 'en', + turns: [ + { userMessage: 'I want something for gaming' }, + { userMessage: 'PC accessories like keyboard and mouse' }, + ], + expectedBehavior: + 'The assistant should return products related to gaming or PC accessories. 
Response should include product names and prices.', + }, +]; + +describe('Product Search E2E Evaluation', () => { + describe.each(productSearchScenarios)('Scenario: $name', (scenario) => { + let evaluationResult: IEvaluationResult; + let conversation: IConversationTurn[]; + + beforeAll(async () => { + const conversationResult = await runConversation(scenario); + conversation = conversationResult.conversation; + + console.log(`\n=== Conversation: ${scenario.name} ===`); + conversation.forEach((turn) => { + console.log(`${turn.role.toUpperCase()}: ${turn.content}`); + }); + + expect(conversationResult.success).toBe(true); + + evaluationResult = await evaluateConversation( + conversation, + defaultProductSearchCriteria, + scenario.expectedBehavior + ); + + console.log(`\nEvaluation Score: ${evaluationResult.score}`); + console.log(`Reasoning: ${evaluationResult.reasoning}\n`); + + if (evaluationResult.score < MINIMUM_PASSING_SCORE) { + saveFailedTest(scenario, conversation, evaluationResult); + } + }, 120000); + + it('should pass LLM evaluation with score >= 4.0', () => { + expect(evaluationResult.score).toBeGreaterThanOrEqual(MINIMUM_PASSING_SCORE); + expect(evaluationResult.passed).toBe(true); + }); + + it('should have valid reasoning', () => { + expect(evaluationResult.reasoning).toBeTruthy(); + expect(evaluationResult.reasoning.length).toBeGreaterThan(10); + }); + }); +}); + +describe('Multi-Turn Conversation E2E Evaluation', () => { + describe.each(multiTurnScenarios)('Scenario: $name', (scenario) => { + let evaluationResult: IEvaluationResult; + let conversation: IConversationTurn[]; + + beforeAll(async () => { + const conversationResult = await runConversation(scenario); + conversation = conversationResult.conversation; + + console.log(`\n=== Multi-Turn: ${scenario.name} ===`); + conversation.forEach((turn) => { + console.log(`${turn.role.toUpperCase()}: ${turn.content}`); + }); + + expect(conversationResult.success).toBe(true); + + evaluationResult = await 
evaluateConversation( + conversation, + defaultProductSearchCriteria, + scenario.expectedBehavior + ); + + console.log(`\nEvaluation Score: ${evaluationResult.score}`); + console.log(`Reasoning: ${evaluationResult.reasoning}\n`); + + if (evaluationResult.score < MINIMUM_PASSING_SCORE) { + saveFailedTest(scenario, conversation, evaluationResult); + } + }, 180000); + + it('should pass LLM evaluation with score >= 4.0', () => { + expect(evaluationResult.score).toBeGreaterThanOrEqual(MINIMUM_PASSING_SCORE); + }); + + it('should have valid reasoning', () => { + expect(evaluationResult.reasoning).toBeTruthy(); + expect(evaluationResult.reasoning.length).toBeGreaterThan(10); + }); + }); +}); + +describe('Edge Cases E2E Evaluation', () => { + const edgeCaseScenarios: Array<{ + scenario: IConversationScenario; + minScore: number; + }> = [ + { + scenario: { + name: 'No product intent - greeting', + locale: 'en', + turns: [{ userMessage: 'Hello, how are you?' }], + expectedBehavior: + 'The assistant should respond with a friendly greeting. Should NOT try to search for products since there is no product intent. A chat response is acceptable.', + }, + minScore: MINIMUM_PASSING_SCORE, + }, + { + scenario: { + name: 'Ambiguous query', + locale: 'en', + turns: [{ userMessage: 'I need something' }], + expectedBehavior: + 'The assistant should either ask for clarification or indicate that it needs more information to search for products. 
Any reasonable response is acceptable.', + }, + minScore: MINIMUM_PASSING_SCORE, + }, + ]; + + describe.each(edgeCaseScenarios)('Edge Case: $scenario.name', ({ scenario, minScore }) => { + let evaluationResult: IEvaluationResult; + let conversation: IConversationTurn[]; + + beforeAll(async () => { + const conversationResult = await runConversation(scenario); + conversation = conversationResult.conversation; + + console.log(`\n=== Edge Case: ${scenario.name} ===`); + conversation.forEach((turn) => { + console.log(`${turn.role.toUpperCase()}: ${turn.content}`); + }); + + expect(conversationResult.success).toBe(true); + + evaluationResult = await evaluateConversation( + conversation, + defaultChatCriteria, + scenario.expectedBehavior + ); + + console.log(`\nEvaluation Score: ${evaluationResult.score}`); + console.log(`Reasoning: ${evaluationResult.reasoning}\n`); + + if (evaluationResult.score < minScore) { + saveFailedTest(scenario, conversation, evaluationResult); + } + }, 120000); + + it('should handle edge case appropriately', () => { + expect(evaluationResult.score).toBeGreaterThanOrEqual(minScore); + }); + }); +}); diff --git a/agents/__tests__/evaluation/testFixtures.ts b/agents/__tests__/evaluation/testFixtures.ts new file mode 100644 index 0000000..ad858d8 --- /dev/null +++ b/agents/__tests__/evaluation/testFixtures.ts @@ -0,0 +1,133 @@ +import { IProductCreateInput, IProduct } from '@/domain/product'; +import { createProduct, PRODUCTS_COLLECTION } from '@/models/products/productsModel'; +import { addProductToWeaviate } from '@/models/products/weaviateProductsModel'; +import { connectToMongo } from '@/clients/mongodb/mongodb'; +import { connectToWeaviate } from '@/clients/weaviate/weaviate'; + +export const TEST_PRODUCTS: IProductCreateInput[] = [ + { + name: 'Gaming Laptop Pro X1', + description: 'High-performance gaming laptop with RTX 4080, 32GB RAM, 1TB SSD. 
Perfect for gaming and content creation.', + price: 4999.99, + sku: 'LAPTOP-GAMING-001', + stock: 15, + category: 'Laptops', + isActive: true, + }, + { + name: 'Business Laptop Elite', + description: 'Professional laptop for business use. Intel i7, 16GB RAM, 512GB SSD. Lightweight and portable.', + price: 3499.99, + sku: 'LAPTOP-BIZ-001', + stock: 25, + category: 'Laptops', + isActive: true, + }, + { + name: 'Budget Laptop Basic', + description: 'Affordable laptop for everyday tasks. Intel i5, 8GB RAM, 256GB SSD.', + price: 1999.99, + sku: 'LAPTOP-BASIC-001', + stock: 50, + category: 'Laptops', + isActive: true, + }, + { + name: 'Samsung Galaxy S24 Ultra', + description: 'Flagship smartphone with 200MP camera, 12GB RAM, 512GB storage. AI-powered features.', + price: 5499.99, + sku: 'PHONE-SAM-001', + stock: 30, + category: 'Smartphones', + isActive: true, + }, + { + name: 'iPhone 15 Pro Max', + description: 'Apple flagship phone with A17 Pro chip, titanium design, 256GB storage.', + price: 5999.99, + sku: 'PHONE-APPLE-001', + stock: 20, + category: 'Smartphones', + isActive: true, + }, + { + name: 'Xiaomi 14 Pro', + description: 'Premium smartphone with Leica camera, Snapdragon 8 Gen 3, 256GB storage.', + price: 3999.99, + sku: 'PHONE-XIAOMI-001', + stock: 40, + category: 'Smartphones', + isActive: true, + }, + { + name: 'Mechanical Gaming Keyboard RGB', + description: 'Mechanical keyboard with Cherry MX switches, RGB backlight, programmable keys.', + price: 599.99, + sku: 'KB-GAMING-001', + stock: 100, + category: 'Gaming Peripherals', + isActive: true, + }, + { + name: 'Gaming Mouse Pro', + description: 'High-precision gaming mouse with 25000 DPI sensor, RGB lighting, 8 programmable buttons.', + price: 299.99, + sku: 'MOUSE-GAMING-001', + stock: 80, + category: 'Gaming Peripherals', + isActive: true, + }, + { + name: 'Sony WH-1000XM5 Headphones', + description: 'Premium wireless noise-cancelling headphones with 30-hour battery life.', + price: 1499.99, + sku: 
'AUDIO-SONY-001', + stock: 35, + category: 'Audio', + isActive: true, + }, + { + name: 'AirPods Pro 2', + description: 'Apple wireless earbuds with active noise cancellation and spatial audio.', + price: 1199.99, + sku: 'AUDIO-APPLE-001', + stock: 45, + category: 'Audio', + isActive: true, + }, +]; + +let createdProducts: IProduct[] = []; + +export const setupTestProducts = async (): Promise => { + console.log('[SETUP] Creating test products...'); + + const weaviateClient = await connectToWeaviate(); + + for (const productData of TEST_PRODUCTS) { + const product = await createProduct(productData); + await addProductToWeaviate(weaviateClient, product); + createdProducts.push(product); + } + + console.log(`[SETUP] Created ${createdProducts.length} test products`); +}; + +export const teardownTestProducts = async (): Promise => { + console.log('[TEARDOWN] Cleaning up test products...'); + + const db = await connectToMongo(); + const collection = db.collection(PRODUCTS_COLLECTION); + + const productIds = createdProducts.map((p) => p._id); + + if (productIds.length > 0) { + const { ObjectId } = await import('mongodb'); + await collection.deleteMany({ + _id: { $in: productIds.map((id) => new ObjectId(id)) }, + }); + } + + createdProducts = []; + console.log('[TEARDOWN] Test products cleaned up'); +}; diff --git a/agents/__tests__/evaluation/testResultsReporter.ts b/agents/__tests__/evaluation/testResultsReporter.ts new file mode 100644 index 0000000..2639d0a --- /dev/null +++ b/agents/__tests__/evaluation/testResultsReporter.ts @@ -0,0 +1,62 @@ +import fs from 'fs'; +import path from 'path'; +import { IConversationTurn, IEvaluationResult } from './evaluator'; +import { IConversationScenario } from './conversationRunner'; + +const LAST_RUN_DIR = path.join(__dirname, 'last-run'); + +export interface ITestResult { + scenario: IConversationScenario; + conversation: IConversationTurn[]; + evaluation: IEvaluationResult; +} + +const ensureDirectoryExists = (dirPath: string) => 
{ + if (!fs.existsSync(dirPath)) { + fs.mkdirSync(dirPath, { recursive: true }); + } +}; + +const sanitizeFilename = (name: string) => { + return name.replace(/[^a-zA-Z0-9-_]/g, '_').toLowerCase(); +}; + +export const clearLastRunDirectory = () => { + if (fs.existsSync(LAST_RUN_DIR)) { + const files = fs.readdirSync(LAST_RUN_DIR); + for (const file of files) { + fs.unlinkSync(path.join(LAST_RUN_DIR, file)); + } + } + ensureDirectoryExists(LAST_RUN_DIR); +}; + +export const saveFailedTest = ( + scenario: IConversationScenario, + conversation: IConversationTurn[], + evaluation: IEvaluationResult +) => { + ensureDirectoryExists(LAST_RUN_DIR); + + const filename = `${sanitizeFilename(scenario.name)}.json`; + const filepath = path.join(LAST_RUN_DIR, filename); + + const output = { + scenario: { + name: scenario.name, + locale: scenario.locale, + expectedBehavior: scenario.expectedBehavior, + turns: scenario.turns.map((t) => t.userMessage), + }, + conversation, + evaluation: { + score: evaluation.score, + passed: evaluation.passed, + reasoning: evaluation.reasoning, + }, + timestamp: new Date().toISOString(), + }; + + fs.writeFileSync(filepath, JSON.stringify(output, null, 2), 'utf-8'); + console.log(`[FAILED] Saved to: ${filepath}`); +}; diff --git a/agents/graph/chatGraph.test.ts b/agents/graph/chatGraph.test.ts index 1ec3d0a..6a779a5 100644 --- a/agents/graph/chatGraph.test.ts +++ b/agents/graph/chatGraph.test.ts @@ -9,15 +9,57 @@ vi.mock('@/services/logger/graphLogger', () => ({ }, })); +vi.mock('@/agents/utils/translations', () => ({ + getAgentTranslations: vi.fn(() => (key: string, params?: Record) => { + const translations: Record = { + noQueryDetected: 'No product query detected. Try describing what you are looking for.', + noProductsFound: 'No products found matching your query.', + foundProducts: 'Found {count} products:', + inStock: 'In stock', + outOfStock: 'Out of stock', + category: 'Category', + searchError: 'An error occurred while searching for products. 
Please try again later.', + }; + let result = translations[key] || key; + if (params) { + Object.entries(params).forEach(([k, v]) => { + result = result.replace(`{${k}}`, String(v)); + }); + } + return result; + }), +})); + +vi.mock('@/clients/weaviate/weaviate', () => ({ + connectToWeaviate: vi.fn(() => Promise.resolve({})), +})); + +vi.mock('@/clients/mongodb/mongodb', () => ({ + connectToMongo: vi.fn(() => Promise.resolve({})), +})); + +vi.mock('@/models/products/weaviateProductsModel', () => ({ + searchProductIdsInWeaviate: vi.fn(() => Promise.resolve([])), +})); + +vi.mock('@/models/products/productsModel', () => ({ + getProductById: vi.fn(() => Promise.resolve(null)), +})); + const mockLlmInvoke = vi.fn(); vi.mock('@/services/llm/llm.service', () => ({ - createBielikClient: vi.fn(() => ({ + createOllamaClient: vi.fn(() => ({ invoke: mockLlmInvoke, })), })); import { executeChatGraphWithStream, IStreamCallback } from './chatGraph'; +import { searchProductIdsInWeaviate } from '@/models/products/weaviateProductsModel'; +import { getProductById } from '@/models/products/productsModel'; + +const mockSearchProductIds = vi.mocked(searchProductIdsInWeaviate); +const mockGetProductById = vi.mocked(getProductById); describe('chatGraph', () => { let mockCallbacks: IStreamCallback; @@ -49,7 +91,9 @@ describe('chatGraph', () => { }); it('should route product query to products agent', async () => { - mockLlmInvoke.mockResolvedValueOnce({ content: 'products' }); + mockLlmInvoke + .mockResolvedValueOnce({ content: 'products' }) + .mockResolvedValueOnce({ content: 'laptop' }); const result = await executeChatGraphWithStream( 'session-123', @@ -58,11 +102,13 @@ describe('chatGraph', () => { mockCallbacks ); - expect(result).toContain('Product search functionality'); + expect(result).toContain('No products found matching your query'); }); it('should route to products agent when router returns "product"', async () => { - mockLlmInvoke.mockResolvedValueOnce({ content: 'product' 
}); + mockLlmInvoke + .mockResolvedValueOnce({ content: 'product' }) + .mockResolvedValueOnce({ content: 'produkt szczegóły' }); const result = await executeChatGraphWithStream( 'session-123', @@ -71,7 +117,7 @@ describe('chatGraph', () => { mockCallbacks ); - expect(result).toContain('implementacji'); + expect(result).toContain('No products found'); }); it('should handle tool call and execute weather tool', async () => { @@ -170,21 +216,124 @@ describe('chatGraph', () => { expect(result).toContain('laptop'); }); - it('should use Polish locale for products fallback message', async () => { - mockLlmInvoke.mockResolvedValueOnce({ content: 'products' }); + it('should return no query detected when LLM returns EMPTY', async () => { + mockLlmInvoke + .mockResolvedValueOnce({ content: 'products' }) + .mockResolvedValueOnce({ content: 'EMPTY' }); const result = await executeChatGraphWithStream( 'session-123', 'pl', - [{ role: 'user', content: 'Pokaż mi laptopy' }], + [{ role: 'user', content: 'Cześć' }], mockCallbacks ); - expect(result).toContain('Funkcja wyszukiwania produktów'); + expect(result).toContain('No product query detected'); }); - it('should use English locale for products fallback message', async () => { - mockLlmInvoke.mockResolvedValueOnce({ content: 'products' }); + it('should return no products found when Weaviate returns empty results', async () => { + mockLlmInvoke + .mockResolvedValueOnce({ content: 'products' }) + .mockResolvedValueOnce({ content: 'laptop' }); + + const result = await executeChatGraphWithStream( + 'session-123', + 'en', + [{ role: 'user', content: 'Show me laptops' }], + mockCallbacks + ); + + expect(result).toContain('No products found matching your query'); + }); + + it('should return formatted products when found in database', async () => { + const mockProduct = { + _id: 'product-1', + name: 'Gaming Laptop Pro', + description: 'High performance gaming laptop', + price: 4999.99, + sku: 'LAPTOP-001', + stock: 10, + category: 
'Electronics', + isActive: true, + deleted: false, + createdAt: new Date(), + updatedAt: new Date(), + }; + + mockLlmInvoke + .mockResolvedValueOnce({ content: 'products' }) + .mockResolvedValueOnce({ content: 'gaming laptop' }); + + mockSearchProductIds.mockResolvedValueOnce(['product-1']); + mockGetProductById.mockResolvedValueOnce(mockProduct); + + const result = await executeChatGraphWithStream( + 'session-123', + 'en', + [{ role: 'user', content: 'Show me gaming laptops' }], + mockCallbacks + ); + + expect(result).toContain('Found 1 products:'); + expect(result).toContain('Gaming Laptop Pro'); + expect(result).toContain('4999.99'); + expect(result).toContain('Electronics'); + expect(result).toContain('In stock'); + }); + + it('should filter out deleted and inactive products', async () => { + const activeProduct = { + _id: 'product-1', + name: 'Active Laptop', + description: 'Available laptop', + price: 3000, + sku: 'LAPTOP-001', + stock: 5, + category: 'Laptops', + isActive: true, + deleted: false, + createdAt: new Date(), + updatedAt: new Date(), + }; + + const deletedProduct = { + _id: 'product-2', + name: 'Deleted Laptop', + description: 'Deleted laptop', + price: 2000, + sku: 'LAPTOP-002', + stock: 0, + category: 'Laptops', + isActive: true, + deleted: true, + createdAt: new Date(), + updatedAt: new Date(), + }; + + const inactiveProduct = { + _id: 'product-3', + name: 'Inactive Laptop', + description: 'Inactive laptop', + price: 1000, + sku: 'LAPTOP-003', + stock: 3, + category: 'Laptops', + isActive: false, + deleted: false, + createdAt: new Date(), + updatedAt: new Date(), + }; + + mockLlmInvoke + .mockResolvedValueOnce({ content: 'products' }) + .mockResolvedValueOnce({ content: 'laptop' }); + + mockSearchProductIds.mockResolvedValueOnce(['product-1', 'product-2', 'product-3']); + mockGetProductById + .mockResolvedValueOnce(activeProduct) + .mockResolvedValueOnce(deletedProduct) + .mockResolvedValueOnce(inactiveProduct); const result = await 
executeChatGraphWithStream( 'session-123', @@ -193,7 +342,10 @@ describe('chatGraph', () => { mockCallbacks ); - expect(result).toContain('Product search functionality'); + expect(result).toContain('Found 1 products:'); + expect(result).toContain('Active Laptop'); + expect(result).not.toContain('Deleted Laptop'); + expect(result).not.toContain('Inactive Laptop'); }); it('should default to chat when router returns unknown agent', async () => { diff --git a/agents/graph/nodes/chatNode.ts b/agents/graph/nodes/chatNode.ts index 3428778..dfe1ec1 100644 --- a/agents/graph/nodes/chatNode.ts +++ b/agents/graph/nodes/chatNode.ts @@ -1,5 +1,5 @@ import { SystemMessage, AIMessage } from '@langchain/core/messages'; -import { createBielikClient } from '@/services/llm/llm.service'; +import { createOllamaClient } from '@/services/llm/llm.service'; import { createChatSystemPrompt } from '@/agents/prompts/chatPrompts'; import { graphLogger } from '@/services/logger/graphLogger'; import { IGraphState } from '@/agents/graph/state'; @@ -10,7 +10,7 @@ const CHAT_TEMPERATURE = 0.7; const CHAT_MAX_TOKENS = 500; export const chatNode = async (state: IGraphState) => { - const llm = createBielikClient(CHAT_TEMPERATURE, CHAT_MAX_TOKENS); + const llm = createOllamaClient(CHAT_TEMPERATURE, CHAT_MAX_TOKENS); const locale = state.locale || 'en'; const systemPrompt = createChatSystemPrompt(locale); diff --git a/agents/graph/nodes/productsNode.ts b/agents/graph/nodes/productsNode.ts index 3ca132c..1762e7a 100644 --- a/agents/graph/nodes/productsNode.ts +++ b/agents/graph/nodes/productsNode.ts @@ -1,15 +1,134 @@ -import { AIMessage } from '@langchain/core/messages'; +import { AIMessage, HumanMessage, SystemMessage, BaseMessage } from '@langchain/core/messages'; import { graphLogger } from '@/services/logger/graphLogger'; import { IGraphState } from '@/agents/graph/state'; +import { isHumanMessage } from '@/agents/utils/messageUtils'; +import { connectToWeaviate } from '@/clients/weaviate/weaviate'; 
+import { connectToMongo } from '@/clients/mongodb/mongodb'; +import { searchProductIdsInWeaviate } from '@/models/products/weaviateProductsModel'; +import { getProductById } from '@/models/products/productsModel'; +import { createOllamaClient } from '@/services/llm/llm.service'; +import { createProductSearchQueryPrompt } from '@/agents/prompts/productsPrompts'; +import { IProduct } from '@/domain/product'; +import { getAgentTranslations } from '@/agents/utils/translations'; + +const SEARCH_LIMIT = 5; +const MAX_CONTEXT_MESSAGES = 10; +const QUERY_EXTRACTION_TEMPERATURE = 0.1; +const QUERY_EXTRACTION_MAX_TOKENS = 100; + +const formatConversation = (messages: BaseMessage[]) => { + return messages + .map((m) => { + const role = isHumanMessage(m) ? 'User' : 'Assistant'; + return `${role}: ${m.content?.toString() || ''}`; + }) + .join('\n'); +}; + +const extractSearchQuery = async (messages: BaseMessage[], locale: string) => { + const recentMessages = messages.slice(-MAX_CONTEXT_MESSAGES); + const conversationText = formatConversation(recentMessages); + + const llm = createOllamaClient(QUERY_EXTRACTION_TEMPERATURE, QUERY_EXTRACTION_MAX_TOKENS); + const systemPrompt = createProductSearchQueryPrompt(locale); + + const response = await llm.invoke([ + new SystemMessage(systemPrompt), + new HumanMessage(`Conversation:\n${conversationText}`), + ]); + + const query = response.content.toString().trim(); + + if (query === 'EMPTY' || query.length === 0) { + return null; + } + + return query; +}; + +interface IProductsTranslations { + noQueryDetected: string; + noProductsFound: string; + foundProducts: string; + inStock: string; + outOfStock: string; + category: string; + searchError: string; +} + +const formatProductsResponse = (products: IProduct[], t: IProductsTranslations) => { + if (products.length === 0) { + return t.noProductsFound; + } + + const header = t.foundProducts.replace('{count}', String(products.length)) + '\n\n'; + + const productsList = products + .map((p, i) 
=> { + const price = p.price.toFixed(2); + const stockText = p.stock > 0 ? t.inStock : t.outOfStock; + return `${i + 1}. **${p.name}** - ${price} zł\n ${p.description}\n ${t.category}: ${p.category} | ${stockText}`; + }) + .join('\n\n'); + + return header + productsList; +}; export const productsNode = async (state: IGraphState) => { const locale = state.locale || 'en'; - const fallbackMessage = - locale === 'pl' - ? 'Funkcja wyszukiwania produktów jest obecnie w trakcie implementacji. Proszę spróbować później.' - : 'Product search functionality is currently under implementation. Please try again later.'; + const t = getAgentTranslations(locale, 'agents.products'); + + const translations: IProductsTranslations = { + noQueryDetected: t('noQueryDetected'), + noProductsFound: t('noProductsFound'), + foundProducts: t('foundProducts'), + inStock: t('inStock'), + outOfStock: t('outOfStock'), + category: t('category'), + searchError: t('searchError'), + }; + + graphLogger.info('products', 'Starting product search'); + + try { + const searchQuery = await extractSearchQuery(state.messages, locale); + + if (!searchQuery) { + const message = translations.noQueryDetected; + return { messages: [new AIMessage(message)], response: message }; + } + + graphLogger.info('products', `Extracted search query: "${searchQuery}"`); + + const weaviateClient = await connectToWeaviate(); + const productIds = await searchProductIdsInWeaviate(weaviateClient, searchQuery, SEARCH_LIMIT); + + graphLogger.info('products', `Found ${productIds.length} product IDs in Weaviate`); + + if (productIds.length === 0) { + const message = formatProductsResponse([], translations); + return { messages: [new AIMessage(message)], response: message }; + } + + const db = await connectToMongo(); + const products: IProduct[] = []; + + for (const mongoId of productIds) { + const product = await getProductById(db, mongoId); + if (product && !product.deleted && product.isActive) { + products.push(product); + } + } + + 
graphLogger.info('products', `Retrieved ${products.length} products from MongoDB`); - graphLogger.info('products', 'Returning fallback message'); + const responseMessage = formatProductsResponse(products, translations); + return { messages: [new AIMessage(responseMessage)], response: responseMessage }; + } catch (error) { + const errorMsg = error instanceof Error ? error.message : 'Unknown error'; + graphLogger.error('products', `Search failed: ${errorMsg}`); - return { messages: [new AIMessage(fallbackMessage)], response: fallbackMessage }; + const message = translations.searchError; + return { messages: [new AIMessage(message)], response: message }; + } }; diff --git a/agents/graph/nodes/routerNode.ts b/agents/graph/nodes/routerNode.ts index 2f4e780..6da025a 100644 --- a/agents/graph/nodes/routerNode.ts +++ b/agents/graph/nodes/routerNode.ts @@ -1,5 +1,5 @@ import { SystemMessage } from '@langchain/core/messages'; -import { createBielikClient } from '@/services/llm/llm.service'; +import { createOllamaClient } from '@/services/llm/llm.service'; import { createRouterSystemPrompt } from '@/agents/prompts/routerPrompts'; import { graphLogger } from '@/services/logger/graphLogger'; import { IGraphState } from '@/agents/graph/state'; @@ -10,7 +10,7 @@ const ROUTER_MAX_TOKENS = 50; const MAX_CONTEXT_MESSAGES = 10; export const routerNode = async (state: IGraphState) => { - const llm = createBielikClient(ROUTER_TEMPERATURE, ROUTER_MAX_TOKENS); + const llm = createOllamaClient(ROUTER_TEMPERATURE, ROUTER_MAX_TOKENS); const locale = state.locale || 'en'; const systemPrompt = createRouterSystemPrompt(locale); diff --git a/agents/prompts/productsPrompts.ts b/agents/prompts/productsPrompts.ts new file mode 100644 index 0000000..3016b72 --- /dev/null +++ b/agents/prompts/productsPrompts.ts @@ -0,0 +1,53 @@ +export const createProductSearchQueryPrompt = (locale: string) => { + const prompts: Record<string, string> = { + en: `You are a search query extractor. 
Analyze the conversation between User and Assistant and extract product search keywords. + +Rules: +- Extract product keywords from the ENTIRE conversation context (names, categories, features) +- Combine information from multiple messages to build a complete query +- Output ONLY the keywords separated by spaces +- Do not add explanations or extra text +- If the conversation has no product-related content, output "EMPTY" + +Examples: + +Conversation: +User: Show me laptops +→ "laptop laptops computer" + +Conversation: +User: Hi, how are you? +→ "EMPTY" + +Conversation: +User: I need something for gaming +Assistant: PC or console? +User: PC, with good graphics +→ "gaming PC graphics GPU computer"`, + pl: `Jesteś ekstraktorem zapytań wyszukiwania. Przeanalizuj konwersację między User a Assistant i wyciągnij słowa kluczowe do wyszukiwania produktów. + +Zasady: +- Wyciągaj słowa kluczowe z CAŁEJ konwersacji (nazwy, kategorie, cechy) +- Łącz informacje z wielu wiadomości żeby zbudować kompletne zapytanie +- Wypisz TYLKO słowa kluczowe oddzielone spacjami +- Nie dodawaj wyjaśnień ani dodatkowego tekstu +- Jeśli konwersacja nie zawiera treści związanej z produktami, wypisz "EMPTY" + +Przykłady: + +Konwersacja: +User: Pokaż mi laptopy +→ "laptop laptopy komputer" + +Konwersacja: +User: Cześć, jak się masz? +→ "EMPTY" + +Konwersacja: +User: Potrzebuję czegoś do grania +Assistant: PC czy konsola? 
+User: PC, z dobrą grafiką +→ "gaming PC grafika GPU komputer"`, + }; + return prompts[locale] || prompts.en; +}; diff --git a/agents/utils/translations.ts b/agents/utils/translations.ts new file mode 100644 index 0000000..4bd4362 --- /dev/null +++ b/agents/utils/translations.ts @@ -0,0 +1,57 @@ +import enMessages from '@/messages/en.json'; +import plMessages from '@/messages/pl.json'; + +type NestedMessages = { + [key: string]: string | NestedMessages; +}; + +const messages: Record<string, NestedMessages> = { + en: enMessages as NestedMessages, + pl: plMessages as NestedMessages, +}; + +const getNestedValue = (obj: NestedMessages, path: string): string | undefined => { + const keys = path.split('.'); + let current: string | NestedMessages = obj; + + for (const key of keys) { + if (typeof current !== 'object' || current === null) { + return undefined; + } + current = current[key]; + } + + return typeof current === 'string' ? current : undefined; +}; + +export interface IAgentTranslator { + (key: string, params?: Record<string, string | number>): string; +} + +export const getAgentTranslations = ( + locale: string, + namespace: string +): IAgentTranslator => { + const localeMessages = messages[locale] || messages.en; + + return (key: string, params?: Record<string, string | number>): string => { + const fullPath = `${namespace}.${key}`; + let value = getNestedValue(localeMessages, fullPath); + + if (!value) { + value = getNestedValue(messages.en, fullPath); + } + + if (!value) { + return key; + } + + if (params) { + Object.entries(params).forEach(([paramKey, paramValue]) => { + value = value!.replace(`{${paramKey}}`, String(paramValue)); + }); + } + + return value; + }; +}; diff --git a/app/[locale]/api/registration/route.ts b/app/[locale]/api/registration/route.ts index 54ef52b..6976305 100644 --- a/app/[locale]/api/registration/route.ts +++ b/app/[locale]/api/registration/route.ts @@ -32,8 +32,6 @@ export const POST = async (request: NextRequest) => { { status: 201 } ); } catch (error) { - console.error('Registration error:', error); - 
if (error instanceof ZodError) { return NextResponse.json( { diff --git a/docker-compose.eval.yml b/docker-compose.eval.yml new file mode 100644 index 0000000..e12a4ea --- /dev/null +++ b/docker-compose.eval.yml @@ -0,0 +1,35 @@ +version: "3.8" + +services: + mongo: + image: mongo:7 + container_name: cognito-eval-mongo + ports: + - "27017:27017" + environment: + - MONGO_INITDB_DATABASE=cognito-eval + tmpfs: + - /data/db + + weaviate: + image: docker.io/semitechnologies/weaviate:1.30.1 + container_name: cognito-eval-weaviate + ports: + - "8080:8080" + - "50051:50051" + environment: + LOG_LEVEL: "warning" + QUERY_DEFAULTS_LIMIT: 100 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" + PERSISTENCE_DATA_PATH: "/var/lib/weaviate" + ENABLE_MODULES: "text2vec-transformers" + CLUSTER_HOSTNAME: "node1" + TRANSFORMERS_INFERENCE_API: "http://text2vec-snowflake:8080" + depends_on: + - text2vec-snowflake + tmpfs: + - /var/lib/weaviate + + text2vec-snowflake: + image: kodercloud/snowflake-l-2-0-weaviate + container_name: cognito-eval-vectorizer diff --git a/docker-compose.yml b/docker-compose.yml index 96f25e4..f003f1e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,6 +78,16 @@ services: device_ids: ["0"] capabilities: [gpu] + ollama-pull: + image: ollama/ollama + depends_on: + - ollama + volumes: + - ollama-data:/root/.ollama + entrypoint: ["/bin/sh", "-c", "sleep 5 && ollama pull mistral-small3.2:24b-instruct-2506-q8_0"] + environment: + - OLLAMA_HOST=ollama:11434 + volumes: mongo-data: weaviate-data: diff --git a/docs/JENKINS_EVAL.md b/docs/JENKINS_EVAL.md new file mode 100644 index 0000000..c79628d --- /dev/null +++ b/docs/JENKINS_EVAL.md @@ -0,0 +1,194 @@ +# Jenkins Evaluation Tests Setup + +This guide explains how to set up and run LLM evaluation tests on Jenkins. 
+ +## Prerequisites + +### Jenkins Server Requirements + +- Jenkins 2.x or newer +- Docker and Docker Compose installed on Jenkins agent +- NodeJS Plugin installed in Jenkins +- Access to Ollama server at `192.168.68.80:11434` + +### Required Jenkins Plugins + +1. **NodeJS Plugin** - for Node.js installation management +2. **Pipeline** - for Jenkinsfile support +3. **Docker Pipeline** (optional) - for better Docker integration + +## Setup Steps + +### 1. Install NodeJS Plugin + +1. Go to **Manage Jenkins** → **Plugins** → **Available plugins** +2. Search for "NodeJS" +3. Install "NodeJS Plugin" +4. Restart Jenkins if required + +### 2. Configure Node.js Tool + +1. Go to **Manage Jenkins** → **Tools** +2. Scroll to **NodeJS installations** +3. Click **Add NodeJS** +4. Configure: + - **Name**: `Node24` (must match exactly) + - **Version**: Select Node.js 24.x + - **Global npm packages to install**: (leave empty) +5. Click **Save** + +### 3. Create Jenkins Pipeline Job + +1. Click **New Item** +2. Enter job name: `cognito-eval-tests` +3. Select **Pipeline** +4. Click **OK** + +### 4. Configure Pipeline + +In the job configuration: + +#### Option A: Pipeline from SCM (Recommended) + +1. In **Pipeline** section, select **Pipeline script from SCM** +2. **SCM**: Git +3. **Repository URL**: `https://github.com/KoderFPV/Cognito.git` +4. **Branch**: `*/feature/product-search-with-evaluation` (or `*/main`) +5. **Script Path**: `Jenkinsfile.eval` +6. Click **Save** + +#### Option B: Inline Pipeline Script + +1. In **Pipeline** section, select **Pipeline script** +2. Copy contents of `Jenkinsfile.eval` into the script area +3. Click **Save** + +### 5. 
Verify Ollama Server + +Ensure Ollama is running and accessible: + +```bash +curl http://192.168.68.80:11434/api/tags +``` + +Verify the model is available: + +```bash +curl http://192.168.68.80:11434/api/tags | grep mistral-small3.2 +``` + +If model is not present, pull it: + +```bash +curl http://192.168.68.80:11434/api/pull -d '{"name": "mistral-small3.2:24b-instruct-2506-q8_0"}' +``` + +### 6. Run the Pipeline + +1. Open the job +2. Click **Build Now** +3. Monitor progress in **Console Output** + +## Pipeline Stages + +| Stage | Description | Duration | +|-------|-------------|----------| +| Checkout | Clone repository | ~10s | +| Start Infrastructure | Start MongoDB + Weaviate containers | ~60-120s | +| Install Dependencies | Run `npm ci` | ~30-60s | +| Run Evaluation Tests | Execute LLM evaluation tests | ~5-10min | + +## Environment Variables + +The pipeline sets these environment variables: + +| Variable | Value | Description | +|----------|-------|-------------| +| `TEST_LOCALE` | `en` | Test locale | +| `OLLAMA_URL` | `http://192.168.68.80:11434/v1` | Ollama API endpoint | +| `OLLAMA_MODEL` | `mistral-small3.2:24b-instruct-2506-q8_0` | LLM model | +| `MONGODB_URI` | `mongodb://localhost:27017/cognito-eval` | MongoDB connection | +| `WEAVIATE_HTTP_HOST` | `localhost` | Weaviate host | +| `WEAVIATE_HTTP_PORT` | `8080` | Weaviate HTTP port | + +## Artifacts + +After each run, failed test results are archived: + +- Location: **Build Artifacts** → `agents/__tests__/evaluation/last-run/` +- Format: JSON files with conversation and evaluation details + +## Troubleshooting + +### MongoDB fails to start + +```bash +# Check container logs +docker logs cognito-eval-mongo + +# Manually test +docker run --rm mongo:7 mongosh --eval "db.runCommand({ ping: 1 })" +``` + +### Weaviate fails to start + +```bash +# Check container logs +docker logs cognito-eval-weaviate +docker logs cognito-eval-vectorizer + +# Verify readiness endpoint +curl 
http://localhost:8080/v1/.well-known/ready +``` + +### Ollama connection refused + +1. Verify Ollama is running on `192.168.68.80` +2. Check firewall rules allow port `11434` +3. Test connectivity from Jenkins agent: + ```bash + curl http://192.168.68.80:11434/api/tags + ``` + +### Node.js tool not found + +Error: `Tool type "nodejs" does not have an install of "Node24" configured` + +Solution: Follow step 2 above to configure Node.js tool with exact name `Node24`. + +### Docker permission denied + +```bash +# Add Jenkins user to docker group +sudo usermod -aG docker jenkins +sudo systemctl restart jenkins +``` + +## Customization + +### Change Ollama Server + +Edit `Jenkinsfile.eval`: + +```groovy +environment { + OLLAMA_URL = 'http://YOUR_OLLAMA_IP:11434/v1' +} +``` + +### Change Model + +Edit `Jenkinsfile.eval`: + +```groovy +environment { + OLLAMA_MODEL = 'your-model-name' +} +``` + +### Schedule Automatic Runs + +In job configuration, add **Build Triggers**: + +- **Build periodically**: `H 2 * * *` (daily at 2 AM) +- **Poll SCM**: `H/15 * * * *` (every 15 minutes) diff --git a/messages/en.json b/messages/en.json index 15aff0a..7b87f82 100644 --- a/messages/en.json +++ b/messages/en.json @@ -166,5 +166,16 @@ "agentError": "AI agent encountered an error", "processingFailed": "Failed to process message" } + }, + "agents": { + "products": { + "noQueryDetected": "No product query detected. Try describing what you are looking for.", + "noProductsFound": "No products found matching your query.", + "foundProducts": "Found {count} products:", + "inStock": "In stock", + "outOfStock": "Out of stock", + "category": "Category", + "searchError": "An error occurred while searching for products. Please try again later." 
+ } } } diff --git a/messages/pl.json b/messages/pl.json index 9666569..01050b1 100644 --- a/messages/pl.json +++ b/messages/pl.json @@ -166,5 +166,16 @@ "agentError": "Agent AI napotkał błąd", "processingFailed": "Nie udało się przetworzyć wiadomości" } + }, + "agents": { + "products": { + "noQueryDetected": "Nie wykryłem zapytania o produkty. Spróbuj opisać czego szukasz.", + "noProductsFound": "Nie znaleziono produktów pasujących do Twojego zapytania.", + "foundProducts": "Znaleziono {count} produktów:", + "inStock": "Dostępny", + "outOfStock": "Niedostępny", + "category": "Kategoria", + "searchError": "Wystąpił błąd podczas wyszukiwania produktów. Spróbuj ponownie później." + } } } diff --git a/models/products/weaviateProductsModel.ts b/models/products/weaviateProductsModel.ts index 33b630a..b080d89 100644 --- a/models/products/weaviateProductsModel.ts +++ b/models/products/weaviateProductsModel.ts @@ -44,3 +44,21 @@ export const deleteProductFromWeaviate = async ( const whereFilter = collection.filter.byProperty('mongoId').equal(productId); await collection.data.deleteMany(whereFilter); }; + +const SEARCH_LIMIT = 5; + +export const searchProductIdsInWeaviate = async ( + client: WeaviateClient, + query: string, + limit: number +): Promise<string[]> => { + const collection = client.collections.get(PRODUCTS_COLLECTION); + + const result = await collection.query.nearText(query, { + targetVector: 'text_vector', + limit, + returnProperties: ['mongoId'], + }); + + return result.objects.map((obj) => (obj.properties as unknown as IWeaviateProduct).mongoId); +}; diff --git a/package.json b/package.json index b2752bc..8ab851a 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ "test": "vitest", "test:watch": "vitest --watch", "test:coverage": "vitest --coverage", + "test:eval": "vitest --config vitest.e2e.config.ts", "test:e2e": "playwright test", "test:e2e:ui": "playwright test --ui", "test:e2e:headed": "playwright test --headed", diff --git 
a/services/llm/llm.service.test.ts b/services/llm/llm.service.test.ts index d57b28c..c751704 100644 --- a/services/llm/llm.service.test.ts +++ b/services/llm/llm.service.test.ts @@ -8,7 +8,7 @@ vi.mock('@langchain/openai', () => ({ ChatOpenAI: mockChatOpenAI, })); -import { createBielikClient } from './llm.service'; +import { createOllamaClient } from './llm.service'; describe('llm.service', () => { const originalEnv = process.env; @@ -25,9 +25,9 @@ describe('llm.service', () => { describe('getOllamaConfig (tested via client creation)', () => { it('should throw error when OLLAMA_URL is not set', () => { delete process.env.OLLAMA_URL; - process.env.OLLAMA_MODEL = 'speakleash/bielik-11b-v3.0-instruct:Q8_0'; + process.env.OLLAMA_MODEL = 'mistral-small3.2:24b-instruct-2506-q8_0'; - expect(() => createBielikClient(0.7, 500)).toThrow( + expect(() => createOllamaClient(0.7, 500)).toThrow( 'OLLAMA_URL environment variable is not set' ); }); @@ -36,30 +36,30 @@ describe('llm.service', () => { process.env.OLLAMA_URL = 'http://localhost:2141/v1'; delete process.env.OLLAMA_MODEL; - expect(() => createBielikClient(0.7, 500)).toThrow( + expect(() => createOllamaClient(0.7, 500)).toThrow( 'OLLAMA_MODEL environment variable is not set' ); }); }); - describe('createBielikClient', () => { + describe('createOllamaClient', () => { beforeEach(() => { process.env.OLLAMA_URL = 'http://localhost:2141/v1'; - process.env.OLLAMA_MODEL = 'speakleash/bielik-11b-v3.0-instruct:Q8_0'; + process.env.OLLAMA_MODEL = 'mistral-small3.2:24b-instruct-2506-q8_0'; }); it('should create ChatOpenAI client with correct model from env', () => { - createBielikClient(0.7, 500); + createOllamaClient(0.7, 500); expect(mockChatOpenAI).toHaveBeenCalledWith( expect.objectContaining({ - model: 'speakleash/bielik-11b-v3.0-instruct:Q8_0', + model: 'mistral-small3.2:24b-instruct-2506-q8_0', }) ); }); it('should create ChatOpenAI client with provided temperature', () => { - createBielikClient(0.5, 500); + 
createOllamaClient(0.5, 500); expect(mockChatOpenAI).toHaveBeenCalledWith( expect.objectContaining({ @@ -69,7 +69,7 @@ describe('llm.service', () => { }); it('should create ChatOpenAI client with provided maxTokens', () => { - createBielikClient(0.7, 1000); + createOllamaClient(0.7, 1000); expect(mockChatOpenAI).toHaveBeenCalledWith( expect.objectContaining({ @@ -79,7 +79,7 @@ describe('llm.service', () => { }); it('should create ChatOpenAI client with correct baseURL', () => { - createBielikClient(0.7, 500); + createOllamaClient(0.7, 500); expect(mockChatOpenAI).toHaveBeenCalledWith( expect.objectContaining({ @@ -91,7 +91,7 @@ describe('llm.service', () => { }); it('should create ChatOpenAI client with ollama as apiKey', () => { - createBielikClient(0.7, 500); + createOllamaClient(0.7, 500); expect(mockChatOpenAI).toHaveBeenCalledWith( expect.objectContaining({ diff --git a/services/llm/llm.service.ts b/services/llm/llm.service.ts index 0dff3cb..e65b8bf 100644 --- a/services/llm/llm.service.ts +++ b/services/llm/llm.service.ts @@ -15,7 +15,7 @@ const getOllamaConfig = () => { return { ollamaUrl, ollamaModel }; }; -export const createBielikClient = (temperature: number, maxTokens: number) => { +export const createOllamaClient = (temperature: number, maxTokens: number) => { const { ollamaUrl, ollamaModel } = getOllamaConfig(); return new ChatOpenAI({ diff --git a/vitest.config.ts b/vitest.config.ts index e45cca3..3689ca5 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -9,7 +9,7 @@ export default defineConfig({ globals: true, setupFiles: ['./test/setup.ts'], include: ['**/*.{test,spec}.{ts,tsx}'], - exclude: ['node_modules', '.next', 'e2e/**'], + exclude: ['node_modules', '.next', 'e2e/**', '**/*.e2e.test.ts'], coverage: { provider: 'v8', reporter: ['text', 'json', 'html'], diff --git a/vitest.e2e.config.ts b/vitest.e2e.config.ts new file mode 100644 index 0000000..a93e7d2 --- /dev/null +++ b/vitest.e2e.config.ts @@ -0,0 +1,28 @@ +import { defineConfig } from 
'vitest/config'; +import path from 'path'; +import { config } from 'dotenv'; + +config({ path: '.env.local' }); + +export default defineConfig({ + test: { + environment: 'node', + globals: true, + include: ['agents/__tests__/evaluation/**/*.e2e.test.ts'], + exclude: ['node_modules', '.next'], + testTimeout: 180000, + hookTimeout: 180000, + pool: 'forks', + // eslint-disable-next-line @typescript-eslint/no-explicit-any + poolOptions: { + forks: { + singleFork: true, + }, + }, + } as any, + resolve: { + alias: { + '@': path.resolve(__dirname, './'), + }, + }, +});