diff --git a/packages/Classification_backend/app/services/ocr/__init__.py b/packages/Classification_backend/app/services/ocr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/Classification_backend/app/services/ocr/analyzer/__init__.py b/packages/Classification_backend/app/services/ocr/analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/Classification_backend/app/services/ocr/analyzer/analyze.py b/packages/Classification_backend/app/services/ocr/analyzer/analyze.py new file mode 100644 index 0000000..83df118 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/analyzer/analyze.py @@ -0,0 +1,146 @@ +# analyzer/analyze.py + +from schema_base import FileResults +from utils.logger import setup_logger +from analyzer.llm import DocumentLLM +from documents.document_type.prompt import document_type_prompt +from documents.document_type.schema import DocumentCategoryAndType +from typing import Optional, List, Tuple +import difflib + +logger = setup_logger(__name__) + + +def _normalize_value(s: str, allowed: Optional[List[str]] = None) -> str: + """ + Normalize a string to lowercase with underscores and optionally fuzzy match + against a provided allowed list. + """ + base = str(s or "unknown").strip().lower().replace(" ", "_").replace("-", "_") + if allowed: + match = difflib.get_close_matches(base, allowed, n=1, cutoff=0.8) + if match: + return match[0] + return base + + +def _normalize_pair( + cat: str, typ: str, + allowed_cats: Optional[List[str]] = None, + allowed_types: Optional[List[str]] = None +) -> Tuple[str, str]: + return ( + _normalize_value(cat, allowed_cats), + _normalize_value(typ, allowed_types) + ) + + +def analyze_document( + file_results: FileResults, + allowed_pairs: Optional[List[dict]] = None, + stop_on_failure: bool = False +) -> Optional[FileResults]: + """ + Runs classification on the provided file_results (uses page images). + Returns updated FileResults with normalized category/type and status. 
+ """ + if not file_results or not file_results.properties or not file_results.properties.page_paths: + logger.debug("No file_results or no page_paths present") + return None + + # Normalize allowed_pairs set and track normalized values for fuzzy matching + allowed_pairs_set = set() + allowed_cats = [] + allowed_types = [] + if allowed_pairs: + for p in allowed_pairs: + try: + c = p.get("document_category") + t = p.get("document_type") + except Exception: + continue + cat_n = _normalize_value(c) + typ_n = _normalize_value(t) + allowed_pairs_set.add((cat_n, typ_n)) + allowed_cats.append(cat_n) + allowed_types.append(typ_n) + + document_llm = DocumentLLM() + logger.info("Analyzing pages: %s", file_results.properties.page_paths) + + response = document_llm.call_llm_api( + prompt=document_type_prompt, + image_path=file_results.properties.page_paths + ) + parsed = response.get("parsed") + raw = response.get("raw") + error = response.get("error") + + # default fallback + file_results.document_category_details.document_category = "unknown" + file_results.document_category_details.document_type = "unknown" + try: + file_results.document_category_details.status = "unknown" + file_results.document_category_details.note = None + except Exception: + pass + + if parsed: + try: + # Validate/coerce with Pydantic model + doc_cat_type = DocumentCategoryAndType.model_validate(parsed) + cat_raw = str(doc_cat_type.document_category) + typ_raw = str(doc_cat_type.document_type) + + # Normalize with fuzzy matching + cat_n, typ_n = _normalize_pair(cat_raw, typ_raw, allowed_cats, allowed_types) + + # Determine status relative to allowed_pairs + if allowed_pairs_set and (cat_n, typ_n) not in allowed_pairs_set: + status = "extra" + note = "Pair not in allowed list (user-provided)" + else: + status = "classified" + note = None + + # Write back + file_results.document_category_details.document_category = cat_n + file_results.document_category_details.document_type = typ_n + try: + file_results.document_category_details.status = status + file_results.document_category_details.note = note + except Exception: + pass + + # Store raw LLM + parsed outputs + file_results.ocr_results = { + "llm_raw": raw, + "llm_parsed": parsed, + "status": status, + "note": note + } + + logger.info("Classification result: %s, %s -> %s", cat_raw, typ_raw, status) + return file_results + + except Exception as e: + logger.exception("Failed to validate LLM parsed output: %s", e) + file_results.document_category_details.document_category = "unknown" + file_results.document_category_details.document_type = "unknown" + file_results.document_category_details.status = "unknown" + file_results.document_category_details.note = "validation_failed" + file_results.ocr_results = {"llm_raw": raw, "error": "validation_failed"} + if stop_on_failure: + return None + return file_results + + else: + logger.warning("LLM returned no parsed JSON (error=%s)", error) + file_results.document_category_details.document_category = "unknown" + file_results.document_category_details.document_type = "unknown" + file_results.document_category_details.status = "unknown" + file_results.document_category_details.note = error or "no_parsed_response" + file_results.ocr_results = {"error": error} + if stop_on_failure: + return None + return file_results diff --git a/packages/Classification_backend/app/services/ocr/analyzer/llm.py b/packages/Classification_backend/app/services/ocr/analyzer/llm.py new file mode 100644 index 0000000..2b3f65c --- /dev/null +++ 
b/packages/Classification_backend/app/services/ocr/analyzer/llm.py
@@ -0,0 +1,67 @@
+# analyzer/llm.py
+from pydantic import BaseModel
+from dotenv import load_dotenv
+import os
+import json
+import re
+import time
+from vertexai.generative_models import Part, Image, GenerativeModel
+import vertexai
+from utils.logger import setup_logger
+
+logger = setup_logger(__name__)
+load_dotenv()
+
+project = os.getenv("GOOGLE_CLOUD_PROJECT")
+location = os.getenv("GOOGLE_CLOUD_LOCATION")
+if project and location:
+    try:
+        vertexai.init(project=project, location=location)
+    except Exception:
+        logger.exception("vertexai.init failed (maybe running locally without credentials).")
+
+
+class DocumentLLM(BaseModel):
+    """
+    Wrapper around the Vertex/Gemini model: sends page images plus a prompt and
+    returns both the parsed and raw outputs.
+    """
+
+    def call_llm_api(self, prompt: str, image_path: list[str], retries: int = 2, backoff: float = 1.0) -> dict:
+        """
+        Calls the generative model with images + prompt.
+        Returns: {"parsed": dict | None, "raw": str | None, "error": str | None}
+        """
+        try:
+            model = GenerativeModel(model_name="gemini-2.0-flash-001")
+        except Exception as e:
+            # if model creation fails, re-raise so the caller can handle the failure
+            logger.exception("Failed to create GenerativeModel: %s", e)
+            raise
+
+        text_part = Part.from_text(prompt)
+        image_parts = []
+        for p in image_path:
+            try:
+                image_parts.append(Part.from_image(Image.load_from_file(p)))
+            except Exception:
+                logger.exception("Failed loading image for LLM: %s", p)
+                # still proceed (the model may accept fewer images)
+        last_exc = None
+        for attempt in range(retries + 1):
+            try:
+                response = model.generate_content([*image_parts, text_part])
+                raw_text = response.text
+                # First try direct JSON parse
+                try:
+                    parsed = json.loads(raw_text)
+                    return {"parsed": parsed, "raw": raw_text, "error": None}
+                except Exception:
+                    # strip fenced code blocks and try again
+                    cleaned = re.sub(r"^```json\s*|\s*```$", "", raw_text, flags=re.MULTILINE)
+                    parsed = json.loads(cleaned)
+                    return {"parsed": parsed, "raw": raw_text, "error": None}
+            except Exception as e:
+                last_exc = e
+                logger.warning("LLM call failed (attempt %d/%d): %s", attempt + 1, retries + 1, str(e))
+                time.sleep(backoff * (2 ** attempt))
+        logger.error("All LLM retries failed: %s", last_exc)
+        return {"parsed": None, "raw": None, "error": str(last_exc)}
diff --git a/packages/Classification_backend/app/services/ocr/documents/__init__.py b/packages/Classification_backend/app/services/ocr/documents/__init__.py
new file mode 100644
index 0000000..a587144
--- /dev/null
+++ b/packages/Classification_backend/app/services/ocr/documents/__init__.py
@@ -0,0 +1,2 @@
+from .document_type.prompt import document_type_prompt
+from .document_type.schema import DocumentCategoryAndType
\ No newline at end of file
diff --git a/packages/Classification_backend/app/services/ocr/documents/document_type/prompt.py b/packages/Classification_backend/app/services/ocr/documents/document_type/prompt.py
new file mode 100644
index 0000000..76203c6
--- /dev/null
+++ b/packages/Classification_backend/app/services/ocr/documents/document_type/prompt.py
@@ -0,0 +1,48 @@
+document_type_prompt = """
+Document Type Identification Agent Prompt
+You are a document classification assistant.
+
+You will be given one or more images of a document. Analyze carefully and output the most appropriate
+document_category and document_type.
+ +Canonical categories and example types: + +{ + "identity_verification_document": ["passport","driving_license","national_identity_card","other"], + "bank_statement": ["bank_statement","other"], + "income_document": ["payslip","p60","contract_of_employment","other"], + "expenditure": ["bank_statement","other"] +} + +Examples: + +Example 1: +Input: passport image +Output: +{"document_category": "identity_verification_document","document_type": "passport"} + +Example 2: +Input: payslip +Output: +{"document_category": "income_document","document_type": "payslip"} + +Example 3: +Input: bank statement +Output: +{"document_category": "bank_statement","document_type": "bank_statement"} + +Example 4: +Input: driving licence +Output: +{"document_category": "identity_verification_document","document_type": "driving_license"} + +Example 5: +Input: irrelevant or unclear +Output: +{"document_category": "unknown","document_type": "unknown"} + +Instructions: +- Always choose from canonical values if possible. +- If unsure, use "unknown". +- Respond with a single JSON object only, no extra commentary. +""" diff --git a/packages/Classification_backend/app/services/ocr/documents/document_type/schema.py b/packages/Classification_backend/app/services/ocr/documents/document_type/schema.py new file mode 100644 index 0000000..678b691 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/documents/document_type/schema.py @@ -0,0 +1,89 @@ +# schema.py +from pydantic import BaseModel, Field, field_validator +from typing import Any +from schema_base import StrEnumBase + + +class DocumentCategoryEnum(StrEnumBase): + IDENTITY_VERIFICATION_DOCUMENT = "identity_verification_document" + BANK_STATEMENT = "bank_statement" + INCOME_DOCUMENT = "income_document" + EXPENDITURE = "expenditure" + CREDIT_REPORT = "credit_report" + OTHER = "other" + UNKNOWN = "unknown" + + +class DocumentTypeEnum(StrEnumBase): + PASSPORT = "passport" + DRIVING_LICENSE = "driving_license" + NATIONAL_IDENTITY_CARD = "national_identity_card" + BANK_STATEMENT = "bank_statement" + PAYSLIP = "payslip" + P60 = "p60" + CONTRACT_OF_EMPLOYMENT = "contract_of_employment" + MARRIAGE_CERTIFICATE = "marriage_certificate" + PRE_MATERNITY_PAYSLIP = "pre_maternity_payslip" + PENSION_PAYSLIP = "pension_payslip" + ANNUAL_PENSION_STATEMENT = "pension_annual_statement" + EMPLOYER_LETTER = "letter_from_employer" + CREDIT_TRANSUNION = "transunion" + CREDIT_EXPERIAN = "experian" + OTHER = "other" + UNKNOWN = "unknown" + + +# mapping dictionaries to fix name mismatches +CATEGORY_MAPPING = { + "income": DocumentCategoryEnum.INCOME_DOCUMENT.value, + "income_document": DocumentCategoryEnum.INCOME_DOCUMENT.value, + "id proof": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "identity": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "identity_document": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "identity_verification_document": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "expenditure": DocumentCategoryEnum.EXPENDITURE.value, + "bank statement": DocumentCategoryEnum.BANK_STATEMENT.value, + "credit report": DocumentCategoryEnum.CREDIT_REPORT.value, +} + +TYPE_MAPPING = { + "pay slip": DocumentTypeEnum.PAYSLIP.value, + "payslip": DocumentTypeEnum.PAYSLIP.value, + "p60": DocumentTypeEnum.P60.value, + "marriage certificate": DocumentTypeEnum.MARRIAGE_CERTIFICATE.value, + "contract of employment": DocumentTypeEnum.CONTRACT_OF_EMPLOYMENT.value, + "previous contract of employment": 
DocumentTypeEnum.CONTRACT_OF_EMPLOYMENT.value, + "pre maternity pay slip": DocumentTypeEnum.PRE_MATERNITY_PAYSLIP.value, + "annual pension scheme statement": DocumentTypeEnum.ANNUAL_PENSION_STATEMENT.value, + "confirmation of pension scheme": DocumentTypeEnum.ANNUAL_PENSION_STATEMENT.value, + "pension pay slip": DocumentTypeEnum.PENSION_PAYSLIP.value, + "pension annual statement": DocumentTypeEnum.ANNUAL_PENSION_STATEMENT.value, + "uk paaport": DocumentTypeEnum.PASSPORT.value, # typo fixed + "passport": DocumentTypeEnum.PASSPORT.value, + "share code": DocumentTypeEnum.NATIONAL_IDENTITY_CARD.value, + "indefinite leave to remain": DocumentTypeEnum.NATIONAL_IDENTITY_CARD.value, + "bank statements": DocumentTypeEnum.BANK_STATEMENT.value, + "transunion": DocumentTypeEnum.CREDIT_TRANSUNION.value, + "experian": DocumentTypeEnum.CREDIT_EXPERIAN.value, +} + + +class DocumentCategoryAndType(BaseModel): + document_category: str = Field(..., description="Category of the document") + document_type: str = Field(..., description="Type of the document") + + @field_validator("document_category", mode="before") + @classmethod + def _coerce_category(cls, v: Any) -> str: + if v is None: + return DocumentCategoryEnum.UNKNOWN.value + s = str(v).strip().lower() + return CATEGORY_MAPPING.get(s, s.replace(" ", "_").replace("-", "_")) + + @field_validator("document_type", mode="before") + @classmethod + def _coerce_type(cls, v: Any) -> str: + if v is None: + return DocumentTypeEnum.UNKNOWN.value + s = str(v).strip().lower() + return TYPE_MAPPING.get(s, s.replace(" ", "_").replace("-", "_")) diff --git a/packages/Classification_backend/app/services/ocr/requirements.txt b/packages/Classification_backend/app/services/ocr/requirements.txt new file mode 100644 index 0000000..672f3ca --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/requirements.txt @@ -0,0 +1,103 @@ +altair==5.5.0 +annotated-types==0.7.0 +anyio==4.9.0 +asttokens==3.0.0 +attrs==25.3.0 +blinker==1.9.0 +cachetools==5.5.2 +certifi==2025.1.31 +charset-normalizer==3.4.1 +click==8.1.8 +comm==0.2.2 +debugpy==1.8.14 +decorator==5.2.1 +distro==1.9.0 +docstring_parser==0.13 +et_xmlfile==2.0.0 +executing==2.2.0 +fastapi==0.115.12 +gitdb==4.0.12 +GitPython==3.1.44 +google-api-core==2.24.2 +google-auth==2.38.0 +google-cloud-aiplatform==1.88.0 +google-cloud-bigquery==3.31.0 +google-cloud-core==2.4.3 +google-cloud-resource-manager==1.14.2 +google-cloud-storage==2.19.0 +google-crc32c==1.7.1 +google-genai==1.10.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.70.0 +grpc-google-iam-v1==0.14.2 +grpcio==1.71.0 +grpcio-status==1.71.0 +h11==0.14.0 +httpcore==1.0.8 +httpx==0.28.1 +idna==3.10 +ipykernel==6.29.5 +ipython==9.1.0 +ipython_pygments_lexers==1.1.1 +jedi==0.19.2 +Jinja2==3.1.6 +jiter==0.9.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +MarkupSafe==3.0.2 +matplotlib-inline==0.1.7 +narwhals==1.34.1 +nest-asyncio==1.6.0 +numpy==2.2.4 +openai==1.73.0 +openpyxl==3.1.5 +packaging==24.2 +pandas==2.2.3 +parso==0.8.4 +pdf2image==1.17.0 +pexpect==4.9.0 +pillow==11.1.0 +platformdirs==4.3.7 +prompt_toolkit==3.0.50 +proto-plus==1.26.1 +protobuf==5.29.4 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==19.0.1 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pydantic==2.11.3 +pydantic_core==2.33.1 +pydeck==0.9.1 +Pygments==2.19.1 +PyMuPDF==1.25.5 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +pytz==2025.2 +pyzmq==26.4.0 +referencing==0.36.2 +requests==2.32.3 +rpds-py==0.24.0 +rsa==4.2 
+shapely==2.1.0 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +stack-data==0.6.3 +starlette==0.46.1 +streamlit==1.44.1 +tenacity==9.1.2 +toml==0.10.2 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing-inspection==0.4.0 +typing_extensions==4.13.2 +tzdata==2025.2 +urllib3==2.4.0 +watchdog==6.0.0 +wcwidth==0.2.13 +websockets==15.0.1 diff --git a/packages/Classification_backend/app/services/ocr/run_engine.py b/packages/Classification_backend/app/services/ocr/run_engine.py new file mode 100644 index 0000000..94eb1a9 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/run_engine.py @@ -0,0 +1,239 @@ +import os +import json +import uuid +import shutil +import hashlib +import asyncio +import time +import tempfile +from typing import List, Optional +from uuid import UUID as UUIDType + +from fastapi import FastAPI, HTTPException, UploadFile, File, Form +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field, ValidationError +from utils.process_file import save_upload_to_temp, process_file, DEFAULT_MAX_UPLOAD_BYTES, DEFAULT_MAX_PDF_PAGES +from analyzer.analyze import analyze_document +from schema_base import FileResults +from documents.document_type.schema import DocumentCategoryAndType +from utils.logger import setup_logger + +logger = setup_logger(__name__) + +app = FastAPI() + +# Configurable limits +MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", DEFAULT_MAX_UPLOAD_BYTES)) +MAX_PDF_PAGES = int(os.getenv("MAX_PDF_PAGES", DEFAULT_MAX_PDF_PAGES)) +FUZZY_MATCHING = os.getenv("FUZZY_MATCHING", "true").lower() in ("1", "true", "yes") + +# In-memory cache (sha256 -> classification dict) +_FILE_CACHE: dict[str, dict] = {} + + +def _sha256_of_file(path: str, chunk: int = 65536) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + for c in iter(lambda: f.read(chunk), b""): + h.update(c) + return h.hexdigest() + + +def _norm(s: Optional[str]) -> str: + if not s: + return "unknown" + return str(s).strip().lower().replace(" ", "_").replace("-", "_") + + +def coerce_category_type_pair(cat: str, typ: str) -> tuple[str, str]: + """ + Normalize strings and coerce bank_statement -> bank_statement category if type indicates bank. + """ + c = _norm(cat) + t = _norm(typ) + # some heuristics + if "bank" in t: + c = "bank_statement" + if c in ("id_proof", "id", "identity"): + c = "identity_verification_document" + if c == "income": + c = "income_document" + return c, t + + +@app.post("/analyze") +async def analyze( + payload: str = Form(...), # JSON string with Application_id, Application_type, total_list_of_documents, required_documents + file: UploadFile = File(...) # single file per request (can be extended to List[UploadFile]) +): + """ + New /analyze endpoint: + - payload: JSON string with keys: + - Application_id + - Application_type + - total_list_of_documents: [{document_category, document_type}, ...] + - required_documents: [{document_category, document_type, is_optional}, ...] + - file: uploaded file (single) + Returns: + { + Application_id, + Application_type, + classification_overall_result, + classification_results: [ {document_category, document_type, optional, result, reason, matched_filename?}, ... 
] + } + """ + # parse payload JSON + try: + data = json.loads(payload) + except Exception as e: + raise HTTPException(status_code=400, detail=f"payload must be valid JSON: {e}") + + # validate minimal fields + if "Application_id" not in data or "Application_type" not in data: + raise HTTPException(status_code=400, detail="payload must include Application_id and Application_type") + + total_list = data.get("total_list_of_documents", []) + required_list = data.get("required_documents", []) + + # prepare allowed_pairs for analyzer + allowed_pairs = [] + for p in total_list: + # tolerate various key names + cat = p.get("document_category") or p.get("category") + typ = p.get("document_type") or p.get("type") + if cat is None or typ is None: + continue + c, t = coerce_category_type_pair(cat, typ) + allowed_pairs.append({"document_category": c, "document_type": t}) + + # process uploaded file (save and run OCR + LLM) + fname = file.filename or "uploaded_file" + saved_path = None + file_results_obj = None + sha = None + + try: + # save upload to temp (streamed) + saved_path = await asyncio.to_thread(save_upload_to_temp, file, None, MAX_UPLOAD_BYTES) + sha = _sha256_of_file(saved_path) + + # try cache + if sha in _FILE_CACHE: + logger.info("Cache hit for file %s", fname) + cached = _FILE_CACHE[sha] + classified_docs = [{ + "filename": fname, + "file_path": saved_path, + "status": cached.get("status", "classified"), + "document_category": cached["document_category"], + "document_type": cached["document_type"], + "note": cached.get("note") + }] + else: + # process file -> produces FileResults with properties.page_paths etc. + file_results_obj = await asyncio.to_thread(process_file, None, saved_path, MAX_PDF_PAGES) + if not file_results_obj or not file_results_obj.properties.file_present or not file_results_obj.properties.page_paths: + raise HTTPException(status_code=400, detail="file not present or contains no pages") + + # analyze_document will attach document_category/document_type and status/note inside file_results_obj + analyzed = await asyncio.to_thread(analyze_document, file_results_obj, allowed_pairs) + + if not analyzed: + raise HTTPException(status_code=500, detail="analysis failed") + + # read values (fall back to ocr_results if needed) + cat = getattr(analyzed.document_category_details, "document_category", "unknown") + typ = getattr(analyzed.document_category_details, "document_type", "unknown") + # try to get status/note; else check ocr_results + status = getattr(analyzed.document_category_details, "status", None) + note = getattr(analyzed.document_category_details, "note", None) + if not status: + ocr = getattr(analyzed, "ocr_results", {}) or {} + status = ocr.get("status", "classified" if cat != "unknown" else "unknown") + note = note or ocr.get("note") + + # normalize + cat, typ = coerce_category_type_pair(cat, typ) + + classified_docs = [{ + "filename": fname, + "file_path": saved_path, + "status": status if status else "classified", + "document_category": cat, + "document_type": typ, + "note": note + }] + + # cache summary + _FILE_CACHE[sha] = { + "document_category": cat, + "document_type": typ, + "status": status, + "note": note, + "timestamp": time.time() + } + + except HTTPException: + raise + except Exception as e: + logger.exception("Error processing uploaded file %s: %s", fname, e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + # cleanup process_file temp directories if present + try: + if file_results_obj and file_results_obj.properties and 
getattr(file_results_obj.properties, "file_dir", None): + shutil.rmtree(file_results_obj.properties.file_dir, ignore_errors=True) + else: + # if saved_path lives in a tmp dir created by save_upload_to_temp, try to remove parent + if saved_path: + parent = os.path.dirname(saved_path) + if parent and "tmp" in parent: + shutil.rmtree(parent, ignore_errors=True) + except Exception: + logger.exception("Cleanup failed for file %s", fname) + + # Now compute results for each required document (strict both category & type) + classification_results = [] + overall_ok = True + + # helper to check presence in classified_docs + def _is_present(req_cat: str, req_typ: str) -> (bool, Optional[str]): + req_c, req_t = coerce_category_type_pair(req_cat, req_typ) + for d in classified_docs: + # only consider classified and classified_extra and cached classified + if d.get("status") not in (None, "classified", "classified_extra", "cache", "classified_cached"): + # still allow matching if status unknown? we stick to classified/classified_extra/cache + pass + if d.get("document_category") == req_c and d.get("document_type") == req_t: + return True, d.get("filename") + return False, None + + for r in required_list: + req_cat = r.get("document_category") + req_typ = r.get("document_type") + is_opt = bool(r.get("is_optional", False)) + + present, matched_filename = _is_present(req_cat, req_typ) + if not present and not is_opt: + overall_ok = False + + reason = f"{req_typ} is present" if present else f"{req_typ} is missing" + + entry = { + "document_category": req_cat, + "document_type": req_typ, + "optional": is_opt, + "result": present, + "reason": reason + } + if present and matched_filename: + entry["matched_filename"] = matched_filename + + classification_results.append(entry) + + return JSONResponse({ + "Application_id": data.get("Application_id"), + "Application_type": data.get("Application_type"), + "classification_overall_result": overall_ok, + "classification_results": classification_results + }, status_code=200) diff --git a/packages/Classification_backend/app/services/ocr/schema_base.py b/packages/Classification_backend/app/services/ocr/schema_base.py new file mode 100644 index 0000000..0af9fb4 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/schema_base.py @@ -0,0 +1,44 @@ +from pydantic import BaseModel, Field +from enum import Enum + + +class StrEnumBase(str, Enum): + """Base enum with str-like behaviour.""" + def __str__(self): + return self.value + + @classmethod + def keys(cls): + return [m.name for m in cls] + + @classmethod + def values(cls): + return [str(m) for m in cls] + + @classmethod + def items(cls): + return [(m.name, str(m)) for m in cls] + + +class FileProperties(BaseModel): + file_path: str = Field(default="", description="Local file path") + file_dir: str = Field(default="", description="Temp dir for file artifacts") + file_type: str = Field(default="", description="File type (pdf, png, jpeg)") + pages: int = Field(default=0, description="Number of pages/images") + page_paths: list[str] = Field(default_factory=list, description="Paths to per-page images") + file_present: bool = Field(default=False, description="Whether file exists") + + +class DocumentCategoryDetails(BaseModel): + """Classification results for one file.""" + document_category: str | None = Field(default=None, description="e.g. income_document") + document_type: str | None = Field(default=None, description="e.g. 
payslip") + status: str | None = Field(default=None, description="classified / extra / unknown") + note: str | None = Field(default=None, description="Any extra comment") + + +class FileResults(BaseModel): + """Aggregated result (file metadata + classification).""" + properties: FileProperties = FileProperties() + document_category_details: DocumentCategoryDetails = DocumentCategoryDetails() + ocr_results: dict | None = Field(default=None, description="Optional OCR/LLM raw output") diff --git a/packages/Classification_backend/app/services/ocr/utils/__init__.py b/packages/Classification_backend/app/services/ocr/utils/__init__.py new file mode 100644 index 0000000..5098ec4 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/__init__.py @@ -0,0 +1 @@ +from .logger import setup_logger \ No newline at end of file diff --git a/packages/Classification_backend/app/services/ocr/utils/image_utils.py b/packages/Classification_backend/app/services/ocr/utils/image_utils.py new file mode 100644 index 0000000..8b462e2 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/image_utils.py @@ -0,0 +1,78 @@ +import base64 +from io import BytesIO +import pymupdf +from PIL import Image +import streamlit as st +import os +from datetime import datetime + + +def generate_metadata(file_path): + """Generate metadata dictionary from file path and properties""" + file_stat = os.stat(file_path) + file_name = os.path.basename(file_path) + parent_dir = os.path.basename(os.path.dirname(file_path)) + + metadata = { + "File Name": file_name, + "Directory": parent_dir, + "File Size": f"{file_stat.st_size / 1024:.2f} KB", + "Last Modified": datetime.fromtimestamp(file_stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S'), + "Created": datetime.fromtimestamp(file_stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S'), + "File Extension": os.path.splitext(file_name)[1], + "Full Path": file_path + } + + # Add image-specific metadata if it's an image + if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')): + try: + with Image.open(file_path) as img: + metadata.update({ + "Image Size": f"{img.size[0]}x{img.size[1]}", + "Image Mode": img.mode, + "Image Format": img.format + }) + except Exception as e: + st.error(f"Error reading image metadata: {str(e)}") + + # Add PDF-specific metadata if it's a PDF + elif file_name.lower().endswith('.pdf'): + try: + doc = pymupdf.Document(file_path) + metadata.update({ + "Page Count": len(doc), + "PDF Version": doc.pdf_version, + "Document Info": doc.metadata if doc.metadata else "No PDF metadata available" + }) + except Exception as e: + st.error(f"Error reading PDF metadata: {str(e)}") + + return metadata + + +def load_pdf_as_image(file_path): + # Open PDF file + doc = pymupdf.Document(file_path) + + # Get the first page + page = doc[0] + + # Convert to image + pix = page.get_pixmap() + + # Convert to PIL Image + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + + return img + + +def im_2_b64(image): + buff = BytesIO() + image.save(buff, format="JPEG") + img_str = base64.b64encode(buff.getvalue()).decode("utf-8") + return img_str + + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') diff --git a/packages/Classification_backend/app/services/ocr/utils/logger.py b/packages/Classification_backend/app/services/ocr/utils/logger.py new file mode 100644 index 0000000..333ed1f --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/logger.py @@ 
-0,0 +1,43 @@
+import logging
+import os
+from datetime import datetime
+# from config import LOGS_DIR
+# Create logs directory if it doesn't exist
+
+LOGS_DIR = "logs_directory"
+os.makedirs(LOGS_DIR, exist_ok=True)
+# Generate filename with timestamp
+log_filename = os.path.join(
+    LOGS_DIR, f"app_{datetime.now().strftime('%Y%m%d')}.log")
+
+
+def setup_logger(name):
+    """
+    Create a logger with the specified name that writes to both file and console
+    """
+    logger = logging.getLogger(name)
+    # Only configure if it hasn't been configured yet
+    if not logger.handlers:
+        logger.setLevel(logging.DEBUG)
+        # Create file handler
+        file_handler = logging.FileHandler(log_filename)
+        file_handler.setLevel(logging.DEBUG)
+        file_formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        file_handler.setFormatter(file_formatter)
+        # Create console handler
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)  # Less verbose for console
+        console_formatter = logging.Formatter(
+            '%(levelname)s - %(name)s - %(message)s'
+        )
+        console_handler.setFormatter(console_formatter)
+        # Add handlers to logger
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+    return logger
+
+
+logger = setup_logger(__name__)
diff --git a/packages/Classification_backend/app/services/ocr/utils/process_file.py b/packages/Classification_backend/app/services/ocr/utils/process_file.py
new file mode 100644
index 0000000..01ebdc2
--- /dev/null
+++ b/packages/Classification_backend/app/services/ocr/utils/process_file.py
@@ -0,0 +1,158 @@
+import tempfile, os
+import secrets
+import string
+from typing import Optional
+from fastapi import UploadFile
+
+from schema_base import FileProperties, FileResults, DocumentCategoryDetails
+from .logger import setup_logger
+import pdf2image
+from PIL import Image
+
+# Optionally override the temp directory (e.g. set TMPDIR=D:/temp on Windows);
+# otherwise fall back to the system default.
+if os.getenv("TMPDIR"):
+    tempfile.tempdir = os.environ["TMPDIR"]
+
+logger = setup_logger(__name__)
+
+# Limits (configurable)
+DEFAULT_MAX_UPLOAD_BYTES = 30 * 1024 * 1024  # 30 MB
+DEFAULT_MAX_PDF_PAGES = 8
+DEFAULT_MAX_IMAGE_DIM = 2500  # px, longest side
+
+
+class FileTypes(str):
+    PNG = "png"
+    JPEG = "jpeg"
+    PDF = "pdf"
+
+
+def generate_random_string(length: int = 16) -> str:
+    chars = string.ascii_letters + string.digits
+    return ''.join(secrets.choice(chars) for _ in range(length))
+
+
+def identify_file_type_by_magic(file_path: str) -> Optional[str]:
+    """Detect png/jpg/pdf using file header magic numbers."""
+    with open(file_path, "rb") as f:
+        header = f.read(10)
+    if header.startswith(b'\x89PNG\r\n\x1a\n'):
+        return FileTypes.PNG
+    if header.startswith(b'\xff\xd8\xff'):
+        return FileTypes.JPEG
+    if header.startswith(b'%PDF'):
+        return FileTypes.PDF
+    return None
+
+
+def identify_file_type(file_path: str) -> Optional[str]:
+    ext = os.path.splitext(file_path)[1].lower().lstrip(".")
+    if ext in ("png",):
+        return FileTypes.PNG
+    if ext in ("jpg", "jpeg"):
+        return FileTypes.JPEG
+    if ext == "pdf":
+        return FileTypes.PDF
+    return identify_file_type_by_magic(file_path)
+
+
+def save_upload_to_temp(upload_file: UploadFile, tmp_dir: Optional[str] = None,
+                        max_bytes: int = DEFAULT_MAX_UPLOAD_BYTES) -> str:
+    """
+    Save uploaded file to a temporary folder (with size limit).
+    Returns full path.
+ """ + base_dir = tmp_dir or tempfile.mkdtemp() + os.makedirs(base_dir, exist_ok=True) + + _, ext = os.path.splitext(upload_file.filename or "") + filename = f"{generate_random_string()}{ext}" + full_path = os.path.join(base_dir, filename) + + total = 0 + chunk_size = 64 * 1024 + with open(full_path, "wb") as out: + while True: + chunk = upload_file.file.read(chunk_size) + if not chunk: + break + total += len(chunk) + if total > max_bytes: + out.close() + try: + os.remove(full_path) + except Exception: + pass + raise ValueError( + f"File '{upload_file.filename}' too large. Max allowed is {max_bytes} bytes." + ) + out.write(chunk) + + # reset file pointer + try: + upload_file.file.seek(0) + except Exception: + pass + + return full_path + + +def process_pdf(file_path: str, + max_pages: int = DEFAULT_MAX_PDF_PAGES, + max_image_dim: int = DEFAULT_MAX_IMAGE_DIM) -> list[str]: + """Convert PDF to PNG pages (downscales if > max_image_dim).""" + images = pdf2image.convert_from_path(file_path) + file_paths: list[str] = [] + for i, img in enumerate(images): + if i >= max_pages: + break + if max(img.size) > max_image_dim: + img.thumbnail((max_image_dim, max_image_dim)) + img_path = f"{file_path}_page_{i}.png" + img.save(img_path, "PNG") + file_paths.append(img_path) + return file_paths + + +def process_file(url: Optional[str] = None, + file_path: Optional[str] = None, + max_pages: int = DEFAULT_MAX_PDF_PAGES) -> FileResults: + """ + Process file into FileResults (detect type, convert PDF to images). + """ + file_properties = FileProperties() + file_properties.file_dir = tempfile.mkdtemp() + file_properties.file_path = file_path or os.path.join(file_properties.file_dir, generate_random_string()) + + # if url provided, download + if url is not None: + import requests + r = requests.get(url) + if r.status_code == 200: + with open(file_properties.file_path, "wb") as f: + f.write(r.content) + else: + raise RuntimeError(f"Failed to download {url}: {r.status_code}") + + file_properties.file_present = os.path.exists(file_properties.file_path) + + if file_properties.file_present: + ftype = identify_file_type(file_properties.file_path) + file_properties.file_type = ftype or "" + if ftype in (FileTypes.PNG, FileTypes.JPEG): + file_properties.page_paths = [file_properties.file_path] + elif ftype == FileTypes.PDF: + file_properties.page_paths = process_pdf(file_properties.file_path, max_pages=max_pages) + else: + file_properties.page_paths = [] + file_properties.pages = len(file_properties.page_paths) + + # wrap + file_results = FileResults( + properties=file_properties, + document_category_details=DocumentCategoryDetails(), # includes status/note now + ocr_results=None + ) + + logger.info("file_results: %s", file_results.model_dump()) + return file_results diff --git a/packages/Classification_backend/app/services/ocr/utils/test_process_file.py b/packages/Classification_backend/app/services/ocr/utils/test_process_file.py new file mode 100644 index 0000000..233f42e --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/test_process_file.py @@ -0,0 +1,106 @@ +import os +import tempfile +import shutil +import pytest +from unittest import mock +from ocr.schema import FileProperties + +from document_reader.ocr.utils.process_file import ( + process_file, + FileTypes, + generate_random_string, + identify_file_type, + identify_file_type_by_magic, + process_pdf, + download_file, +) + +@pytest.fixture +def temp_png_file(): + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, "test.png") 
+ # Minimal valid PNG header + with open(file_path, "wb") as f: + f.write(b'\x89PNG\r\n\x1a\n' + b'\x00' * 10) + yield file_path + shutil.rmtree(temp_dir) + +@pytest.fixture +def temp_jpeg_file(): + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, "test.jpg") + # Minimal valid JPEG header + with open(file_path, "wb") as f: + f.write(b'\xff\xd8\xff' + b'\x00' * 10) + yield file_path + shutil.rmtree(temp_dir) + +@pytest.fixture +def temp_pdf_file(): + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, "test.pdf") + # Minimal valid PDF header + with open(file_path, "wb") as f: + f.write(b'%PDF-1.4\n%') + yield file_path + shutil.rmtree(temp_dir) + +def test_generate_random_string_length(): + s = generate_random_string(24) + assert isinstance(s, str) + assert len(s) == 24 + +def test_identify_file_type_png(temp_png_file): + assert identify_file_type(temp_png_file) == FileTypes.PNG + +def test_identify_file_type_jpeg(temp_jpeg_file): + assert identify_file_type(temp_jpeg_file) == FileTypes.JPEG + +def test_identify_file_type_pdf(temp_pdf_file): + assert identify_file_type(temp_pdf_file) == FileTypes.PDF + +def test_identify_file_type_by_magic_png(temp_png_file): + assert identify_file_type_by_magic(temp_png_file) == FileTypes.PNG + +def test_identify_file_type_by_magic_jpeg(temp_jpeg_file): + assert identify_file_type_by_magic(temp_jpeg_file) == FileTypes.JPEG + +def test_identify_file_type_by_magic_pdf(temp_pdf_file): + assert identify_file_type_by_magic(temp_pdf_file) == FileTypes.PDF + +def test_process_file_png(temp_png_file): + props = process_file(file_path=temp_png_file) + assert props.file_present is True + assert props.file_type == FileTypes.PNG + assert props.pages == 1 + assert props.page_paths == [props.file_path] + +def test_process_file_jpeg(temp_jpeg_file): + props = process_file(file_path=temp_jpeg_file) + assert props.file_present is True + assert props.file_type == FileTypes.JPEG + assert props.pages == 1 + assert props.page_paths == [props.file_path] + +def test_process_file_pdf_calls_process_pdf(temp_pdf_file): + # Patch process_pdf to avoid actual PDF processing + with mock.patch("document_reader.ocr.utils.process_file.process_pdf", return_value=["page1.png", "page2.png"]): + props = process_file(file_path=temp_pdf_file) + assert props.file_present is True + assert props.file_type == FileTypes.PDF + assert props.pages == 2 + assert props.page_paths == ["page1.png", "page2.png"] + +def test_download_file_success(tmp_path, requests_mock): + url = "http://example.com/test.png" + file_path = tmp_path / "test.png" + requests_mock.get(url, content=b"abc", status_code=200) + download_file(url, str(file_path)) + assert file_path.read_bytes() == b"abc" + +def test_download_file_failure(tmp_path, requests_mock): + url = "http://example.com/test.png" + file_path = tmp_path / "test.png" + requests_mock.get(url, status_code=404) + with pytest.raises(Exception): + download_file(url, str(file_path)) \ No newline at end of file
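For reference, a minimal client sketch for the new /analyze endpoint defined in run_engine.py. This is illustrative only: the host/port, file name, and the document lists in the payload are assumptions, not part of this change; the form field names ("payload" as a JSON string, "file" as the upload) and the response fields come from the endpoint itself.

# Hypothetical client call against a locally running instance of run_engine.py.
# Assumes the service is served on http://localhost:8000 and that payslip.pdf exists.
import json
import requests

payload = {
    "Application_id": "APP-123",        # hypothetical application id
    "Application_type": "mortgage",     # hypothetical application type
    "total_list_of_documents": [
        {"document_category": "income_document", "document_type": "payslip"},
        {"document_category": "bank_statement", "document_type": "bank_statement"},
    ],
    "required_documents": [
        {"document_category": "income_document", "document_type": "payslip", "is_optional": False},
    ],
}

with open("payslip.pdf", "rb") as fh:
    resp = requests.post(
        "http://localhost:8000/analyze",
        data={"payload": json.dumps(payload)},                 # JSON string form field expected by the endpoint
        files={"file": ("payslip.pdf", fh, "application/pdf")},  # single uploaded file per request
    )

print(resp.status_code)
print(resp.json())  # includes classification_overall_result and per-requirement classification_results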