diff --git a/packages/Classification_backend/app/services/ocr/__init__.py b/packages/Classification_backend/app/services/ocr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/Classification_backend/app/services/ocr/analyzer/__init__.py b/packages/Classification_backend/app/services/ocr/analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/Classification_backend/app/services/ocr/analyzer/analyze.py b/packages/Classification_backend/app/services/ocr/analyzer/analyze.py new file mode 100644 index 0000000..83df118 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/analyzer/analyze.py @@ -0,0 +1,146 @@ +# analyzer/analyze.py + +from schema_base import FileResults +from utils.logger import setup_logger +from analyzer.llm import DocumentLLM +from documents.document_type.prompt import document_type_prompt +from documents.document_type.schema import DocumentCategoryAndType +from typing import Optional, List, Tuple +import difflib + +logger = setup_logger(__name__) + + +def _normalize_value(s: str, allowed: Optional[List[str]] = None) -> str: + """ + Normalize a string to lowercase with underscores and optionally fuzzy match + against a provided allowed list. + """ + base = str(s or "unknown").strip().lower().replace(" ", "_").replace("-", "_") + if allowed: + match = difflib.get_close_matches(base, allowed, n=1, cutoff=0.8) + if match: + return match[0] + return base + + +def _normalize_pair( + cat: str, typ: str, + allowed_cats: Optional[List[str]] = None, + allowed_types: Optional[List[str]] = None +) -> Tuple[str, str]: + return ( + _normalize_value(cat, allowed_cats), + _normalize_value(typ, allowed_types) + ) + + +def analyze_document( + file_results: FileResults, + allowed_pairs: Optional[List[dict]] = None, + stop_on_failure: bool = False +) -> Optional[FileResults]: + """ + Runs classification on the provided file_results (uses page images). + Returns updated FileResults with normalized category/type and status. 
+ """ + if not file_results or not file_results.properties or not file_results.properties.page_paths: + logger.debug("No file_results or no page_paths present") + return None + + # Normalize allowed_pairs set and track normalized values for fuzzy matching + allowed_pairs_set = set() + allowed_cats = [] + allowed_types = [] + if allowed_pairs: + for p in allowed_pairs: + try: + c = p.get("document_category") + t = p.get("document_type") + except Exception: + continue + cat_n = _normalize_value(c) + typ_n = _normalize_value(t) + allowed_pairs_set.add((cat_n, typ_n)) + allowed_cats.append(cat_n) + allowed_types.append(typ_n) + + document_llm = DocumentLLM() + logger.info("Analyzing pages: %s", file_results.properties.page_paths) + + response = document_llm.call_llm_api( + prompt=document_type_prompt, + image_path=file_results.properties.page_paths + ) + parsed = response.get("parsed") + raw = response.get("raw") + error = response.get("error") + + # default fallback + file_results.document_category_details.document_category = "unknown" + file_results.document_category_details.document_type = "unknown" + try: + file_results.document_category_details.status = "unknown" + file_results.document_category_details.note = None + except Exception: + pass + + if parsed: + try: + # Validate/coerce with Pydantic model + doc_cat_type = DocumentCategoryAndType.model_validate(parsed) + cat_raw = str(doc_cat_type.document_category) + typ_raw = str(doc_cat_type.document_type) + + # Normalize with fuzzy matching + cat_n, typ_n = _normalize_pair(cat_raw, typ_raw, allowed_cats, allowed_types) + + # Determine status relative to allowed_pairs + if allowed_pairs_set and (cat_n, typ_n) not in allowed_pairs_set: + status = "extra" + note = "Pair not in allowed list (user-provided)" + else: + status = "classified" + note = None + + # Write back + file_results.document_category_details.document_category = cat_n + file_results.document_category_details.document_type = typ_n + try: + file_results.document_category_details.status = status + file_results.document_category_details.note = note + except Exception: + pass + + # Store raw LLM + parsed outputs + file_results.ocr_results = { + "llm_raw": raw, + "llm_parsed": parsed, + "status": status, + "note": note + } + + logger.info("Classification result: %s, %s -> %s", cat_raw, typ_raw, status) + return file_results + + except Exception as e: + logger.exception("Failed to validate LLM parsed output: %s", e) + file_results.document_category_details.document_category = "unknown" + file_results.document_category_details.document_type = "unknown" + file_results.document_category_details.status = "unknown" + file_results.document_category_details.note = "validation_failed" + file_results.ocr_results = {"llm_raw": raw, "error": "validation_failed"} + if stop_on_failure: + return None + return file_results + + else: + logger.warning("LLM returned no parsed JSON (error=%s)", error) + file_results.document_category_details.document_category = "unknown" + file_results.document_category_details.document_type = "unknown" + file_results.document_category_details.status = "unknown" + file_results.document_category_details.note = error or "no_parsed_response" + file_results.ocr_results = {"error": error} + if stop_on_failure: + return None + return file_results diff --git a/packages/Classification_backend/app/services/ocr/analyzer/llm.py b/packages/Classification_backend/app/services/ocr/analyzer/llm.py new file mode 100644 index 0000000..2b3f65c --- /dev/null +++ 
b/packages/Classification_backend/app/services/ocr/analyzer/llm.py
@@ -0,0 +1,67 @@
+# analyzer/llm.py
+from pydantic import BaseModel
+from dotenv import load_dotenv
+import os
+import json
+import re
+import time
+from vertexai.generative_models import Part, Image, GenerativeModel
+import vertexai
+from utils.logger import setup_logger
+
+logger = setup_logger(__name__)
+load_dotenv()
+
+project = os.getenv("GOOGLE_CLOUD_PROJECT")
+location = os.getenv("GOOGLE_CLOUD_LOCATION")
+if project and location:
+    try:
+        vertexai.init(project=project, location=location)
+    except Exception:
+        logger.exception("vertexai.init failed (maybe running locally without credentials).")
+
+
+class DocumentLLM(BaseModel):
+    """
+    Wrapper around the Vertex/Gemini model: sends page images plus a prompt and
+    returns both the parsed and raw outputs.
+    """
+
+    def call_llm_api(self, prompt: str, image_path: list[str], retries: int = 2, backoff: float = 1.0) -> dict:
+        """
+        Calls the generative model with images + prompt.
+        Returns: {"parsed": dict | None, "raw": str | None, "error": str | None}
+        """
+        try:
+            model = GenerativeModel(model_name="gemini-2.0-flash-001")
+        except Exception as e:
+            # if model creation fails, re-raise so the caller can handle the failure
+            logger.exception("Failed to create GenerativeModel: %s", e)
+            raise
+
+        text_part = Part.from_text(prompt)
+        image_parts = []
+        for p in image_path:
+            try:
+                image_parts.append(Part.from_image(Image.load_from_file(p)))
+            except Exception:
+                logger.exception("Failed loading image for LLM: %s", p)
+                # still proceed (the model may accept fewer images)
+        last_exc = None
+        for attempt in range(retries + 1):
+            try:
+                response = model.generate_content([*image_parts, text_part])
+                raw_text = response.text
+                # First try direct JSON parse
+                try:
+                    parsed = json.loads(raw_text)
+                    return {"parsed": parsed, "raw": raw_text, "error": None}
+                except Exception:
+                    # strip fenced code blocks and try again
+                    cleaned = re.sub(r"^```json\s*|\s*```$", "", raw_text, flags=re.MULTILINE)
+                    parsed = json.loads(cleaned)
+                    return {"parsed": parsed, "raw": raw_text, "error": None}
+            except Exception as e:
+                last_exc = e
+                logger.warning("LLM call failed (attempt %d/%d): %s", attempt + 1, retries + 1, str(e))
+                time.sleep(backoff * (2 ** attempt))
+        logger.error("All LLM retries failed: %s", last_exc)
+        return {"parsed": None, "raw": None, "error": str(last_exc)}
diff --git a/packages/Classification_backend/app/services/ocr/documents/__init__.py b/packages/Classification_backend/app/services/ocr/documents/__init__.py
new file mode 100644
index 0000000..a587144
--- /dev/null
+++ b/packages/Classification_backend/app/services/ocr/documents/__init__.py
@@ -0,0 +1,2 @@
+from .document_type.prompt import document_type_prompt
+from .document_type.schema import DocumentCategoryAndType
\ No newline at end of file
diff --git a/packages/Classification_backend/app/services/ocr/documents/document_type/prompt.py b/packages/Classification_backend/app/services/ocr/documents/document_type/prompt.py
new file mode 100644
index 0000000..76203c6
--- /dev/null
+++ b/packages/Classification_backend/app/services/ocr/documents/document_type/prompt.py
@@ -0,0 +1,48 @@
+document_type_prompt = """
+Document Type Identification Agent Prompt
+You are a document classification assistant.
+
+You will be given one or more images of a document. Analyze carefully and output the most appropriate
+document_category and document_type.
+ +Canonical categories and example types: + +{ + "identity_verification_document": ["passport","driving_license","national_identity_card","other"], + "bank_statement": ["bank_statement","other"], + "income_document": ["payslip","p60","contract_of_employment","other"], + "expenditure": ["bank_statement","other"] +} + +Examples: + +Example 1: +Input: passport image +Output: +{"document_category": "identity_verification_document","document_type": "passport"} + +Example 2: +Input: payslip +Output: +{"document_category": "income_document","document_type": "payslip"} + +Example 3: +Input: bank statement +Output: +{"document_category": "bank_statement","document_type": "bank_statement"} + +Example 4: +Input: driving licence +Output: +{"document_category": "identity_verification_document","document_type": "driving_license"} + +Example 5: +Input: irrelevant or unclear +Output: +{"document_category": "unknown","document_type": "unknown"} + +Instructions: +- Always choose from canonical values if possible. +- If unsure, use "unknown". +- Respond with a single JSON object only, no extra commentary. +""" diff --git a/packages/Classification_backend/app/services/ocr/documents/document_type/schema.py b/packages/Classification_backend/app/services/ocr/documents/document_type/schema.py new file mode 100644 index 0000000..678b691 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/documents/document_type/schema.py @@ -0,0 +1,89 @@ +# schema.py +from pydantic import BaseModel, Field, field_validator +from typing import Any +from schema_base import StrEnumBase + + +class DocumentCategoryEnum(StrEnumBase): + IDENTITY_VERIFICATION_DOCUMENT = "identity_verification_document" + BANK_STATEMENT = "bank_statement" + INCOME_DOCUMENT = "income_document" + EXPENDITURE = "expenditure" + CREDIT_REPORT = "credit_report" + OTHER = "other" + UNKNOWN = "unknown" + + +class DocumentTypeEnum(StrEnumBase): + PASSPORT = "passport" + DRIVING_LICENSE = "driving_license" + NATIONAL_IDENTITY_CARD = "national_identity_card" + BANK_STATEMENT = "bank_statement" + PAYSLIP = "payslip" + P60 = "p60" + CONTRACT_OF_EMPLOYMENT = "contract_of_employment" + MARRIAGE_CERTIFICATE = "marriage_certificate" + PRE_MATERNITY_PAYSLIP = "pre_maternity_payslip" + PENSION_PAYSLIP = "pension_payslip" + ANNUAL_PENSION_STATEMENT = "pension_annual_statement" + EMPLOYER_LETTER = "letter_from_employer" + CREDIT_TRANSUNION = "transunion" + CREDIT_EXPERIAN = "experian" + OTHER = "other" + UNKNOWN = "unknown" + + +# mapping dictionaries to fix name mismatches +CATEGORY_MAPPING = { + "income": DocumentCategoryEnum.INCOME_DOCUMENT.value, + "income_document": DocumentCategoryEnum.INCOME_DOCUMENT.value, + "id proof": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "identity": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "identity_document": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "identity_verification_document": DocumentCategoryEnum.IDENTITY_VERIFICATION_DOCUMENT.value, + "expenditure": DocumentCategoryEnum.EXPENDITURE.value, + "bank statement": DocumentCategoryEnum.BANK_STATEMENT.value, + "credit report": DocumentCategoryEnum.CREDIT_REPORT.value, +} + +TYPE_MAPPING = { + "pay slip": DocumentTypeEnum.PAYSLIP.value, + "payslip": DocumentTypeEnum.PAYSLIP.value, + "p60": DocumentTypeEnum.P60.value, + "marriage certificate": DocumentTypeEnum.MARRIAGE_CERTIFICATE.value, + "contract of employment": DocumentTypeEnum.CONTRACT_OF_EMPLOYMENT.value, + "previous contract of employment": 
DocumentTypeEnum.CONTRACT_OF_EMPLOYMENT.value, + "pre maternity pay slip": DocumentTypeEnum.PRE_MATERNITY_PAYSLIP.value, + "annual pension scheme statement": DocumentTypeEnum.ANNUAL_PENSION_STATEMENT.value, + "confirmation of pension scheme": DocumentTypeEnum.ANNUAL_PENSION_STATEMENT.value, + "pension pay slip": DocumentTypeEnum.PENSION_PAYSLIP.value, + "pension annual statement": DocumentTypeEnum.ANNUAL_PENSION_STATEMENT.value, + "uk paaport": DocumentTypeEnum.PASSPORT.value, # typo fixed + "passport": DocumentTypeEnum.PASSPORT.value, + "share code": DocumentTypeEnum.NATIONAL_IDENTITY_CARD.value, + "indefinite leave to remain": DocumentTypeEnum.NATIONAL_IDENTITY_CARD.value, + "bank statements": DocumentTypeEnum.BANK_STATEMENT.value, + "transunion": DocumentTypeEnum.CREDIT_TRANSUNION.value, + "experian": DocumentTypeEnum.CREDIT_EXPERIAN.value, +} + + +class DocumentCategoryAndType(BaseModel): + document_category: str = Field(..., description="Category of the document") + document_type: str = Field(..., description="Type of the document") + + @field_validator("document_category", mode="before") + @classmethod + def _coerce_category(cls, v: Any) -> str: + if v is None: + return DocumentCategoryEnum.UNKNOWN.value + s = str(v).strip().lower() + return CATEGORY_MAPPING.get(s, s.replace(" ", "_").replace("-", "_")) + + @field_validator("document_type", mode="before") + @classmethod + def _coerce_type(cls, v: Any) -> str: + if v is None: + return DocumentTypeEnum.UNKNOWN.value + s = str(v).strip().lower() + return TYPE_MAPPING.get(s, s.replace(" ", "_").replace("-", "_")) diff --git a/packages/Classification_backend/app/services/ocr/requirements.txt b/packages/Classification_backend/app/services/ocr/requirements.txt new file mode 100644 index 0000000..672f3ca --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/requirements.txt @@ -0,0 +1,103 @@ +altair==5.5.0 +annotated-types==0.7.0 +anyio==4.9.0 +asttokens==3.0.0 +attrs==25.3.0 +blinker==1.9.0 +cachetools==5.5.2 +certifi==2025.1.31 +charset-normalizer==3.4.1 +click==8.1.8 +comm==0.2.2 +debugpy==1.8.14 +decorator==5.2.1 +distro==1.9.0 +docstring_parser==0.13 +et_xmlfile==2.0.0 +executing==2.2.0 +fastapi==0.115.12 +gitdb==4.0.12 +GitPython==3.1.44 +google-api-core==2.24.2 +google-auth==2.38.0 +google-cloud-aiplatform==1.88.0 +google-cloud-bigquery==3.31.0 +google-cloud-core==2.4.3 +google-cloud-resource-manager==1.14.2 +google-cloud-storage==2.19.0 +google-crc32c==1.7.1 +google-genai==1.10.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.70.0 +grpc-google-iam-v1==0.14.2 +grpcio==1.71.0 +grpcio-status==1.71.0 +h11==0.14.0 +httpcore==1.0.8 +httpx==0.28.1 +idna==3.10 +ipykernel==6.29.5 +ipython==9.1.0 +ipython_pygments_lexers==1.1.1 +jedi==0.19.2 +Jinja2==3.1.6 +jiter==0.9.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +MarkupSafe==3.0.2 +matplotlib-inline==0.1.7 +narwhals==1.34.1 +nest-asyncio==1.6.0 +numpy==2.2.4 +openai==1.73.0 +openpyxl==3.1.5 +packaging==24.2 +pandas==2.2.3 +parso==0.8.4 +pdf2image==1.17.0 +pexpect==4.9.0 +pillow==11.1.0 +platformdirs==4.3.7 +prompt_toolkit==3.0.50 +proto-plus==1.26.1 +protobuf==5.29.4 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==19.0.1 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pydantic==2.11.3 +pydantic_core==2.33.1 +pydeck==0.9.1 +Pygments==2.19.1 +PyMuPDF==1.25.5 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +pytz==2025.2 +pyzmq==26.4.0 +referencing==0.36.2 +requests==2.32.3 +rpds-py==0.24.0 +rsa==4.2 
+shapely==2.1.0 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +stack-data==0.6.3 +starlette==0.46.1 +streamlit==1.44.1 +tenacity==9.1.2 +toml==0.10.2 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing-inspection==0.4.0 +typing_extensions==4.13.2 +tzdata==2025.2 +urllib3==2.4.0 +watchdog==6.0.0 +wcwidth==0.2.13 +websockets==15.0.1 diff --git a/packages/Classification_backend/app/services/ocr/run_engine.py b/packages/Classification_backend/app/services/ocr/run_engine.py new file mode 100644 index 0000000..94eb1a9 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/run_engine.py @@ -0,0 +1,239 @@ +import os +import json +import uuid +import shutil +import hashlib +import asyncio +import time +import tempfile +from typing import List, Optional +from uuid import UUID as UUIDType + +from fastapi import FastAPI, HTTPException, UploadFile, File, Form +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field, ValidationError +from utils.process_file import save_upload_to_temp, process_file, DEFAULT_MAX_UPLOAD_BYTES, DEFAULT_MAX_PDF_PAGES +from analyzer.analyze import analyze_document +from schema_base import FileResults +from documents.document_type.schema import DocumentCategoryAndType +from utils.logger import setup_logger + +logger = setup_logger(__name__) + +app = FastAPI() + +# Configurable limits +MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", DEFAULT_MAX_UPLOAD_BYTES)) +MAX_PDF_PAGES = int(os.getenv("MAX_PDF_PAGES", DEFAULT_MAX_PDF_PAGES)) +FUZZY_MATCHING = os.getenv("FUZZY_MATCHING", "true").lower() in ("1", "true", "yes") + +# In-memory cache (sha256 -> classification dict) +_FILE_CACHE: dict[str, dict] = {} + + +def _sha256_of_file(path: str, chunk: int = 65536) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + for c in iter(lambda: f.read(chunk), b""): + h.update(c) + return h.hexdigest() + + +def _norm(s: Optional[str]) -> str: + if not s: + return "unknown" + return str(s).strip().lower().replace(" ", "_").replace("-", "_") + + +def coerce_category_type_pair(cat: str, typ: str) -> tuple[str, str]: + """ + Normalize strings and coerce bank_statement -> bank_statement category if type indicates bank. + """ + c = _norm(cat) + t = _norm(typ) + # some heuristics + if "bank" in t: + c = "bank_statement" + if c in ("id_proof", "id", "identity"): + c = "identity_verification_document" + if c == "income": + c = "income_document" + return c, t + + +@app.post("/analyze") +async def analyze( + payload: str = Form(...), # JSON string with Application_id, Application_type, total_list_of_documents, required_documents + file: UploadFile = File(...) # single file per request (can be extended to List[UploadFile]) +): + """ + New /analyze endpoint: + - payload: JSON string with keys: + - Application_id + - Application_type + - total_list_of_documents: [{document_category, document_type}, ...] + - required_documents: [{document_category, document_type, is_optional}, ...] + - file: uploaded file (single) + Returns: + { + Application_id, + Application_type, + classification_overall_result, + classification_results: [ {document_category, document_type, optional, result, reason, matched_filename?}, ... 
] + } + """ + # parse payload JSON + try: + data = json.loads(payload) + except Exception as e: + raise HTTPException(status_code=400, detail=f"payload must be valid JSON: {e}") + + # validate minimal fields + if "Application_id" not in data or "Application_type" not in data: + raise HTTPException(status_code=400, detail="payload must include Application_id and Application_type") + + total_list = data.get("total_list_of_documents", []) + required_list = data.get("required_documents", []) + + # prepare allowed_pairs for analyzer + allowed_pairs = [] + for p in total_list: + # tolerate various key names + cat = p.get("document_category") or p.get("category") + typ = p.get("document_type") or p.get("type") + if cat is None or typ is None: + continue + c, t = coerce_category_type_pair(cat, typ) + allowed_pairs.append({"document_category": c, "document_type": t}) + + # process uploaded file (save and run OCR + LLM) + fname = file.filename or "uploaded_file" + saved_path = None + file_results_obj = None + sha = None + + try: + # save upload to temp (streamed) + saved_path = await asyncio.to_thread(save_upload_to_temp, file, None, MAX_UPLOAD_BYTES) + sha = _sha256_of_file(saved_path) + + # try cache + if sha in _FILE_CACHE: + logger.info("Cache hit for file %s", fname) + cached = _FILE_CACHE[sha] + classified_docs = [{ + "filename": fname, + "file_path": saved_path, + "status": cached.get("status", "classified"), + "document_category": cached["document_category"], + "document_type": cached["document_type"], + "note": cached.get("note") + }] + else: + # process file -> produces FileResults with properties.page_paths etc. + file_results_obj = await asyncio.to_thread(process_file, None, saved_path, MAX_PDF_PAGES) + if not file_results_obj or not file_results_obj.properties.file_present or not file_results_obj.properties.page_paths: + raise HTTPException(status_code=400, detail="file not present or contains no pages") + + # analyze_document will attach document_category/document_type and status/note inside file_results_obj + analyzed = await asyncio.to_thread(analyze_document, file_results_obj, allowed_pairs) + + if not analyzed: + raise HTTPException(status_code=500, detail="analysis failed") + + # read values (fall back to ocr_results if needed) + cat = getattr(analyzed.document_category_details, "document_category", "unknown") + typ = getattr(analyzed.document_category_details, "document_type", "unknown") + # try to get status/note; else check ocr_results + status = getattr(analyzed.document_category_details, "status", None) + note = getattr(analyzed.document_category_details, "note", None) + if not status: + ocr = getattr(analyzed, "ocr_results", {}) or {} + status = ocr.get("status", "classified" if cat != "unknown" else "unknown") + note = note or ocr.get("note") + + # normalize + cat, typ = coerce_category_type_pair(cat, typ) + + classified_docs = [{ + "filename": fname, + "file_path": saved_path, + "status": status if status else "classified", + "document_category": cat, + "document_type": typ, + "note": note + }] + + # cache summary + _FILE_CACHE[sha] = { + "document_category": cat, + "document_type": typ, + "status": status, + "note": note, + "timestamp": time.time() + } + + except HTTPException: + raise + except Exception as e: + logger.exception("Error processing uploaded file %s: %s", fname, e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + # cleanup process_file temp directories if present + try: + if file_results_obj and file_results_obj.properties and 
getattr(file_results_obj.properties, "file_dir", None): + shutil.rmtree(file_results_obj.properties.file_dir, ignore_errors=True) + else: + # if saved_path lives in a tmp dir created by save_upload_to_temp, try to remove parent + if saved_path: + parent = os.path.dirname(saved_path) + if parent and "tmp" in parent: + shutil.rmtree(parent, ignore_errors=True) + except Exception: + logger.exception("Cleanup failed for file %s", fname) + + # Now compute results for each required document (strict both category & type) + classification_results = [] + overall_ok = True + + # helper to check presence in classified_docs + def _is_present(req_cat: str, req_typ: str) -> (bool, Optional[str]): + req_c, req_t = coerce_category_type_pair(req_cat, req_typ) + for d in classified_docs: + # only consider classified and classified_extra and cached classified + if d.get("status") not in (None, "classified", "classified_extra", "cache", "classified_cached"): + # still allow matching if status unknown? we stick to classified/classified_extra/cache + pass + if d.get("document_category") == req_c and d.get("document_type") == req_t: + return True, d.get("filename") + return False, None + + for r in required_list: + req_cat = r.get("document_category") + req_typ = r.get("document_type") + is_opt = bool(r.get("is_optional", False)) + + present, matched_filename = _is_present(req_cat, req_typ) + if not present and not is_opt: + overall_ok = False + + reason = f"{req_typ} is present" if present else f"{req_typ} is missing" + + entry = { + "document_category": req_cat, + "document_type": req_typ, + "optional": is_opt, + "result": present, + "reason": reason + } + if present and matched_filename: + entry["matched_filename"] = matched_filename + + classification_results.append(entry) + + return JSONResponse({ + "Application_id": data.get("Application_id"), + "Application_type": data.get("Application_type"), + "classification_overall_result": overall_ok, + "classification_results": classification_results + }, status_code=200) diff --git a/packages/Classification_backend/app/services/ocr/schema_base.py b/packages/Classification_backend/app/services/ocr/schema_base.py new file mode 100644 index 0000000..0af9fb4 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/schema_base.py @@ -0,0 +1,44 @@ +from pydantic import BaseModel, Field +from enum import Enum + + +class StrEnumBase(str, Enum): + """Base enum with str-like behaviour.""" + def __str__(self): + return self.value + + @classmethod + def keys(cls): + return [m.name for m in cls] + + @classmethod + def values(cls): + return [str(m) for m in cls] + + @classmethod + def items(cls): + return [(m.name, str(m)) for m in cls] + + +class FileProperties(BaseModel): + file_path: str = Field(default="", description="Local file path") + file_dir: str = Field(default="", description="Temp dir for file artifacts") + file_type: str = Field(default="", description="File type (pdf, png, jpeg)") + pages: int = Field(default=0, description="Number of pages/images") + page_paths: list[str] = Field(default_factory=list, description="Paths to per-page images") + file_present: bool = Field(default=False, description="Whether file exists") + + +class DocumentCategoryDetails(BaseModel): + """Classification results for one file.""" + document_category: str | None = Field(default=None, description="e.g. income_document") + document_type: str | None = Field(default=None, description="e.g. 
payslip") + status: str | None = Field(default=None, description="classified / extra / unknown") + note: str | None = Field(default=None, description="Any extra comment") + + +class FileResults(BaseModel): + """Aggregated result (file metadata + classification).""" + properties: FileProperties = FileProperties() + document_category_details: DocumentCategoryDetails = DocumentCategoryDetails() + ocr_results: dict | None = Field(default=None, description="Optional OCR/LLM raw output") diff --git a/packages/Classification_backend/app/services/ocr/utils/__init__.py b/packages/Classification_backend/app/services/ocr/utils/__init__.py new file mode 100644 index 0000000..5098ec4 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/__init__.py @@ -0,0 +1 @@ +from .logger import setup_logger \ No newline at end of file diff --git a/packages/Classification_backend/app/services/ocr/utils/image_utils.py b/packages/Classification_backend/app/services/ocr/utils/image_utils.py new file mode 100644 index 0000000..8b462e2 --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/image_utils.py @@ -0,0 +1,78 @@ +import base64 +from io import BytesIO +import pymupdf +from PIL import Image +import streamlit as st +import os +from datetime import datetime + + +def generate_metadata(file_path): + """Generate metadata dictionary from file path and properties""" + file_stat = os.stat(file_path) + file_name = os.path.basename(file_path) + parent_dir = os.path.basename(os.path.dirname(file_path)) + + metadata = { + "File Name": file_name, + "Directory": parent_dir, + "File Size": f"{file_stat.st_size / 1024:.2f} KB", + "Last Modified": datetime.fromtimestamp(file_stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S'), + "Created": datetime.fromtimestamp(file_stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S'), + "File Extension": os.path.splitext(file_name)[1], + "Full Path": file_path + } + + # Add image-specific metadata if it's an image + if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')): + try: + with Image.open(file_path) as img: + metadata.update({ + "Image Size": f"{img.size[0]}x{img.size[1]}", + "Image Mode": img.mode, + "Image Format": img.format + }) + except Exception as e: + st.error(f"Error reading image metadata: {str(e)}") + + # Add PDF-specific metadata if it's a PDF + elif file_name.lower().endswith('.pdf'): + try: + doc = pymupdf.Document(file_path) + metadata.update({ + "Page Count": len(doc), + "PDF Version": doc.pdf_version, + "Document Info": doc.metadata if doc.metadata else "No PDF metadata available" + }) + except Exception as e: + st.error(f"Error reading PDF metadata: {str(e)}") + + return metadata + + +def load_pdf_as_image(file_path): + # Open PDF file + doc = pymupdf.Document(file_path) + + # Get the first page + page = doc[0] + + # Convert to image + pix = page.get_pixmap() + + # Convert to PIL Image + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + + return img + + +def im_2_b64(image): + buff = BytesIO() + image.save(buff, format="JPEG") + img_str = base64.b64encode(buff.getvalue()).decode("utf-8") + return img_str + + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') diff --git a/packages/Classification_backend/app/services/ocr/utils/logger.py b/packages/Classification_backend/app/services/ocr/utils/logger.py new file mode 100644 index 0000000..333ed1f --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/logger.py @@ 
-0,0 +1,43 @@
+import logging
+import os
+from datetime import datetime
+# from config import LOGS_DIR
+# Create logs directory if it doesn't exist
+
+LOGS_DIR = "logs_directory"
+os.makedirs(LOGS_DIR, exist_ok=True)
+# Generate filename with timestamp
+log_filename = os.path.join(
+    LOGS_DIR, f"app_{datetime.now().strftime('%Y%m%d')}.log")
+
+
+def setup_logger(name):
+    """
+    Create a logger with the specified name that writes to both file and console
+    """
+    logger = logging.getLogger(name)
+    # Only configure if it hasn't been configured yet
+    if not logger.handlers:
+        logger.setLevel(logging.DEBUG)
+        # Create file handler
+        file_handler = logging.FileHandler(log_filename)
+        file_handler.setLevel(logging.DEBUG)
+        file_formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        file_handler.setFormatter(file_formatter)
+        # Create console handler
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)  # Less verbose for console
+        console_formatter = logging.Formatter(
+            '%(levelname)s - %(name)s - %(message)s'
+        )
+        console_handler.setFormatter(console_formatter)
+        # Add handlers to logger
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+    return logger
+
+
+logger = setup_logger(__name__)
diff --git a/packages/Classification_backend/app/services/ocr/utils/process_file.py b/packages/Classification_backend/app/services/ocr/utils/process_file.py
new file mode 100644
index 0000000..01ebdc2
--- /dev/null
+++ b/packages/Classification_backend/app/services/ocr/utils/process_file.py
@@ -0,0 +1,158 @@
+import tempfile, os
+import secrets
+import string
+from typing import Optional
+from fastapi import UploadFile
+
+from schema_base import FileProperties, FileResults, DocumentCategoryDetails
+from .logger import setup_logger
+import pdf2image
+from PIL import Image
+
+# Optionally override the temp directory (e.g. set TMPDIR=D:/temp on Windows);
+# otherwise fall back to the system default.
+if os.getenv("TMPDIR"):
+    tempfile.tempdir = os.environ["TMPDIR"]
+
+logger = setup_logger(__name__)
+
+# Limits (configurable)
+DEFAULT_MAX_UPLOAD_BYTES = 30 * 1024 * 1024  # 30 MB
+DEFAULT_MAX_PDF_PAGES = 8
+DEFAULT_MAX_IMAGE_DIM = 2500  # px, longest side
+
+
+class FileTypes(str):
+    PNG = "png"
+    JPEG = "jpeg"
+    PDF = "pdf"
+
+
+def generate_random_string(length: int = 16) -> str:
+    chars = string.ascii_letters + string.digits
+    return ''.join(secrets.choice(chars) for _ in range(length))
+
+
+def identify_file_type_by_magic(file_path: str) -> Optional[str]:
+    """Detect png/jpg/pdf using file header magic numbers."""
+    with open(file_path, "rb") as f:
+        header = f.read(10)
+    if header.startswith(b'\x89PNG\r\n\x1a\n'):
+        return FileTypes.PNG
+    if header.startswith(b'\xff\xd8\xff'):
+        return FileTypes.JPEG
+    if header.startswith(b'%PDF'):
+        return FileTypes.PDF
+    return None
+
+
+def identify_file_type(file_path: str) -> Optional[str]:
+    ext = os.path.splitext(file_path)[1].lower().lstrip(".")
+    if ext in ("png",):
+        return FileTypes.PNG
+    if ext in ("jpg", "jpeg"):
+        return FileTypes.JPEG
+    if ext == "pdf":
+        return FileTypes.PDF
+    return identify_file_type_by_magic(file_path)
+
+
+def save_upload_to_temp(upload_file: UploadFile, tmp_dir: Optional[str] = None,
+                        max_bytes: int = DEFAULT_MAX_UPLOAD_BYTES) -> str:
+    """
+    Save uploaded file to a temporary folder (with size limit).
+    Returns full path.
+ """ + base_dir = tmp_dir or tempfile.mkdtemp() + os.makedirs(base_dir, exist_ok=True) + + _, ext = os.path.splitext(upload_file.filename or "") + filename = f"{generate_random_string()}{ext}" + full_path = os.path.join(base_dir, filename) + + total = 0 + chunk_size = 64 * 1024 + with open(full_path, "wb") as out: + while True: + chunk = upload_file.file.read(chunk_size) + if not chunk: + break + total += len(chunk) + if total > max_bytes: + out.close() + try: + os.remove(full_path) + except Exception: + pass + raise ValueError( + f"File '{upload_file.filename}' too large. Max allowed is {max_bytes} bytes." + ) + out.write(chunk) + + # reset file pointer + try: + upload_file.file.seek(0) + except Exception: + pass + + return full_path + + +def process_pdf(file_path: str, + max_pages: int = DEFAULT_MAX_PDF_PAGES, + max_image_dim: int = DEFAULT_MAX_IMAGE_DIM) -> list[str]: + """Convert PDF to PNG pages (downscales if > max_image_dim).""" + images = pdf2image.convert_from_path(file_path) + file_paths: list[str] = [] + for i, img in enumerate(images): + if i >= max_pages: + break + if max(img.size) > max_image_dim: + img.thumbnail((max_image_dim, max_image_dim)) + img_path = f"{file_path}_page_{i}.png" + img.save(img_path, "PNG") + file_paths.append(img_path) + return file_paths + + +def process_file(url: Optional[str] = None, + file_path: Optional[str] = None, + max_pages: int = DEFAULT_MAX_PDF_PAGES) -> FileResults: + """ + Process file into FileResults (detect type, convert PDF to images). + """ + file_properties = FileProperties() + file_properties.file_dir = tempfile.mkdtemp() + file_properties.file_path = file_path or os.path.join(file_properties.file_dir, generate_random_string()) + + # if url provided, download + if url is not None: + import requests + r = requests.get(url) + if r.status_code == 200: + with open(file_properties.file_path, "wb") as f: + f.write(r.content) + else: + raise RuntimeError(f"Failed to download {url}: {r.status_code}") + + file_properties.file_present = os.path.exists(file_properties.file_path) + + if file_properties.file_present: + ftype = identify_file_type(file_properties.file_path) + file_properties.file_type = ftype or "" + if ftype in (FileTypes.PNG, FileTypes.JPEG): + file_properties.page_paths = [file_properties.file_path] + elif ftype == FileTypes.PDF: + file_properties.page_paths = process_pdf(file_properties.file_path, max_pages=max_pages) + else: + file_properties.page_paths = [] + file_properties.pages = len(file_properties.page_paths) + + # wrap + file_results = FileResults( + properties=file_properties, + document_category_details=DocumentCategoryDetails(), # includes status/note now + ocr_results=None + ) + + logger.info("file_results: %s", file_results.model_dump()) + return file_results diff --git a/packages/Classification_backend/app/services/ocr/utils/test_process_file.py b/packages/Classification_backend/app/services/ocr/utils/test_process_file.py new file mode 100644 index 0000000..233f42e --- /dev/null +++ b/packages/Classification_backend/app/services/ocr/utils/test_process_file.py @@ -0,0 +1,106 @@ +import os +import tempfile +import shutil +import pytest +from unittest import mock +from ocr.schema import FileProperties + +from document_reader.ocr.utils.process_file import ( + process_file, + FileTypes, + generate_random_string, + identify_file_type, + identify_file_type_by_magic, + process_pdf, + download_file, +) + +@pytest.fixture +def temp_png_file(): + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, "test.png") 
+ # Minimal valid PNG header + with open(file_path, "wb") as f: + f.write(b'\x89PNG\r\n\x1a\n' + b'\x00' * 10) + yield file_path + shutil.rmtree(temp_dir) + +@pytest.fixture +def temp_jpeg_file(): + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, "test.jpg") + # Minimal valid JPEG header + with open(file_path, "wb") as f: + f.write(b'\xff\xd8\xff' + b'\x00' * 10) + yield file_path + shutil.rmtree(temp_dir) + +@pytest.fixture +def temp_pdf_file(): + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, "test.pdf") + # Minimal valid PDF header + with open(file_path, "wb") as f: + f.write(b'%PDF-1.4\n%') + yield file_path + shutil.rmtree(temp_dir) + +def test_generate_random_string_length(): + s = generate_random_string(24) + assert isinstance(s, str) + assert len(s) == 24 + +def test_identify_file_type_png(temp_png_file): + assert identify_file_type(temp_png_file) == FileTypes.PNG + +def test_identify_file_type_jpeg(temp_jpeg_file): + assert identify_file_type(temp_jpeg_file) == FileTypes.JPEG + +def test_identify_file_type_pdf(temp_pdf_file): + assert identify_file_type(temp_pdf_file) == FileTypes.PDF + +def test_identify_file_type_by_magic_png(temp_png_file): + assert identify_file_type_by_magic(temp_png_file) == FileTypes.PNG + +def test_identify_file_type_by_magic_jpeg(temp_jpeg_file): + assert identify_file_type_by_magic(temp_jpeg_file) == FileTypes.JPEG + +def test_identify_file_type_by_magic_pdf(temp_pdf_file): + assert identify_file_type_by_magic(temp_pdf_file) == FileTypes.PDF + +def test_process_file_png(temp_png_file): + props = process_file(file_path=temp_png_file) + assert props.file_present is True + assert props.file_type == FileTypes.PNG + assert props.pages == 1 + assert props.page_paths == [props.file_path] + +def test_process_file_jpeg(temp_jpeg_file): + props = process_file(file_path=temp_jpeg_file) + assert props.file_present is True + assert props.file_type == FileTypes.JPEG + assert props.pages == 1 + assert props.page_paths == [props.file_path] + +def test_process_file_pdf_calls_process_pdf(temp_pdf_file): + # Patch process_pdf to avoid actual PDF processing + with mock.patch("document_reader.ocr.utils.process_file.process_pdf", return_value=["page1.png", "page2.png"]): + props = process_file(file_path=temp_pdf_file) + assert props.file_present is True + assert props.file_type == FileTypes.PDF + assert props.pages == 2 + assert props.page_paths == ["page1.png", "page2.png"] + +def test_download_file_success(tmp_path, requests_mock): + url = "http://example.com/test.png" + file_path = tmp_path / "test.png" + requests_mock.get(url, content=b"abc", status_code=200) + download_file(url, str(file_path)) + assert file_path.read_bytes() == b"abc" + +def test_download_file_failure(tmp_path, requests_mock): + url = "http://example.com/test.png" + file_path = tmp_path / "test.png" + requests_mock.get(url, status_code=404) + with pytest.raises(Exception): + download_file(url, str(file_path)) \ No newline at end of file
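For reference, a minimal client sketch for the new /analyze endpoint defined in run_engine.py. This is illustrative only: the host/port, file name, and the document lists in the payload are assumptions, not part of this change; the form field names ("payload" as a JSON string, "file" as the upload) and the response fields come from the endpoint itself.

# Hypothetical client call against a locally running instance of run_engine.py.
# Assumes the service is served on http://localhost:8000 and that payslip.pdf exists.
import json
import requests

payload = {
    "Application_id": "APP-123",        # hypothetical application id
    "Application_type": "mortgage",     # hypothetical application type
    "total_list_of_documents": [
        {"document_category": "income_document", "document_type": "payslip"},
        {"document_category": "bank_statement", "document_type": "bank_statement"},
    ],
    "required_documents": [
        {"document_category": "income_document", "document_type": "payslip", "is_optional": False},
    ],
}

with open("payslip.pdf", "rb") as fh:
    resp = requests.post(
        "http://localhost:8000/analyze",
        data={"payload": json.dumps(payload)},                 # JSON string form field expected by the endpoint
        files={"file": ("payslip.pdf", fh, "application/pdf")},  # single uploaded file per request
    )

print(resp.status_code)
print(resp.json())  # includes classification_overall_result and per-requirement classification_results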