From b5874cc0cbd62add13c36ac57ec7ccf18bd13dd5 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:26:58 -0600 Subject: [PATCH 1/9] chore: add tests --- .github/workflows/{docker.yaml => ci.yaml} | 50 ++++- input.bib | 112 +++++++++++ test/files/test.txt => input.txt | 0 main.py | 47 +---- preprocessing/__init__.py | 0 preprocessing/loader.py | 14 +- requirements.txt | 1 + test/__init__.py | 0 test/conftest.py | 20 ++ test/files/expected_dtm_from_bib.pkl | Bin 0 -> 7546 bytes test/files/expected_dtm_from_txt.pkl | Bin 0 -> 1081 bytes test/files/expected_vocab_from_bib.pkl | Bin 0 -> 9047 bytes test/files/expected_vocab_from_txt.pkl | Bin 0 -> 400 bytes test/files/input.txt | 4 + test/test_full.py | 213 +++++++++++++++++++++ test/test_loaders.py | 33 ++++ test/test_normalize.py | 17 ++ test/test_preprocessor_unit.py | 26 +++ 18 files changed, 487 insertions(+), 50 deletions(-) rename .github/workflows/{docker.yaml => ci.yaml} (52%) create mode 100644 input.bib rename test/files/test.txt => input.txt (100%) create mode 100644 preprocessing/__init__.py create mode 100644 test/__init__.py create mode 100644 test/conftest.py create mode 100644 test/files/expected_dtm_from_bib.pkl create mode 100644 test/files/expected_dtm_from_txt.pkl create mode 100644 test/files/expected_vocab_from_bib.pkl create mode 100644 test/files/expected_vocab_from_txt.pkl create mode 100644 test/files/input.txt create mode 100644 test/test_full.py create mode 100644 test/test_loaders.py create mode 100644 test/test_normalize.py create mode 100644 test/test_preprocessor_unit.py diff --git a/.github/workflows/docker.yaml b/.github/workflows/ci.yaml similarity index 52% rename from .github/workflows/docker.yaml rename to .github/workflows/ci.yaml index df0d4cf..3939887 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,5 @@ -name: Docker +name: CI + on: push: branches: @@ -10,9 +11,55 @@ env: IMAGE_NAME: ${{ github.repository }} jobs: + lint-python: + name: Lint Python + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + cache: "pip" + + - name: Run flake8 + uses: py-actions/flake8@v2 + + test: + runs-on: ubuntu-latest + needs: lint-python + services: + minio: + image: lazybit/minio + ports: + - 9000:9000 + env: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + options: >- + --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1" + --health-interval 5s + --health-retries 5 + --health-timeout 5s + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + cache: "pip" + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Run Tests + run: pytest -vv + build: name: Build docker image runs-on: ubuntu-latest + needs: test permissions: contents: read packages: write @@ -42,3 +89,4 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + diff --git a/input.bib b/input.bib new file mode 100644 index 0000000..f525305 --- /dev/null +++ b/input.bib @@ -0,0 +1,112 @@ + +@article{ WOS:001016714700004, +Author = {White, Joel}, +Title = {Theoretical and Practical Paralogisms of Digital Immortality}, +Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY}, +Year = {2022}, +Volume = {9}, +Number = {2, SI}, +Pages = {155-172}, +Month = {JUL 3}, +Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic + and popular relevance; specific 
naive metaphysical ideas, such as + immortality, have returned with this rise. This article refrains from + any ethical or political assessment of transhumanism. Still, it + critiques the exact metaphysical or idealistic nature of transhumanism + and its pursuit of digital immortality: the idea that, through + technological advancements, precisely in Artificial General + Intelligence, an immortal virtual ``self{''} will become possible. The + article follows the form of Immanuel Kant's ``Paralogisms{''} from the + Critique of Pure Reason, where Kant is concerned with the substantial, + immortal nature of the soul and its experiential impossibility. The + article will offer theoretical and practical paralogisms (false logical + inferences), arguing that the transhumanist claim that digital + immortality is possible fundamentally stems from two incorrect major + premises. The first concerns the substantial nature of information, + which informs the theoretical paralogisms; the second concerns infinite + transformation (pure plasticity), which informs the practical + paralogisms}, +Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD}, +Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND}, +Type = {Article}, +Language = {English}, +DOI = {10.1080/20539320.2022.2150463}, +ISSN = {2053-9320}, +EISSN = {2053-9339}, +Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms; + Digital Immortality}, +Research-Areas = {Philosophy}, +Web-of-Science-Categories = {Philosophy}, +Author-Email = {jhmw01@gmail.com}, +ORCID-Numbers = {White, Joel/0000-0001-6460-0564}, +Number-of-Cited-References = {30}, +Times-Cited = {0}, +Usage-Count-Last-180-days = {3}, +Usage-Count-Since-2013 = {15}, +Journal-ISO = {J. Aesthet. Phenomenol.}, +Doc-Delivery-Number = {K5GF0}, +Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)}, +Unique-ID = {WOS:001016714700004}, +DA = {2025-06-26}, +} + +@article{ WOS:001322577100012, +Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir}, +Title = {Sociopolitical Challenges to Digital Transformation of Rural + Communities: Learnings from a Case Study From Manipur, India}, +Journal = {IT PROFESSIONAL}, +Year = {2024}, +Volume = {26}, +Number = {4}, +Pages = {42-47}, +Month = {JUL-AUG}, +Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized + the inclusive growth of digital networks and digital public goods, + utilizing a multistakeholder systems approach. Similarly, the + information and communications technology (ICT) Innovation and + Intervention Program of the Government of India's Digital North East + Vision 2022 has also emphasized a need for inclusive growth of ICT in + the Northeast Region. In line with the above, this article presents + insights from a field study conducted in the rural parts of Manipur, + India, which incidentally can be found to be applicable to many rural + parts of the developing world. The article envisions a community-driven + sociodigital transformation of the Northeast Region of India. 
In this + quest, the article highlights sociopolitical challenges for digital + transformation and provides insights for inclusive ICT in such + regions-infrastructure as a utility for every citizen, smart governance + and services on demand, digital empowerment of citizens, social welfare, + capacity building, and community engagement.}, +Publisher = {IEEE COMPUTER SOC}, +Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA}, +Type = {Article}, +Language = {English}, +Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India. + Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India. + Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India. + Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.}, +DOI = {10.1109/MITP.2024.3433459}, +ISSN = {1520-9202}, +EISSN = {1941-045X}, +Keywords = {Technological innovation; Digital transformation; Government; Buildings; + Asia; Africa; Information and communication technology}, +Research-Areas = {Computer Science; Telecommunications}, +Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software + Engineering; Telecommunications}, +Author-Email = {vkant@iitk.ac.in + sanjrampk@iiti.ac.in + sudhir.dixit@ieee.org}, +Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of + Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT + System); Indian Institute of Technology (IIT) - Indore}, +ResearcherID-Numbers = {/ITU-6308-2023}, +ORCID-Numbers = {/0000-0002-6215-7500}, +Number-of-Cited-References = {7}, +Times-Cited = {0}, +Usage-Count-Last-180-days = {11}, +Usage-Count-Since-2013 = {22}, +Journal-ISO = {IT Prof.}, +Doc-Delivery-Number = {H3O9D}, +Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)}, +Unique-ID = {WOS:001322577100012}, +DA = {2025-06-26}, +} diff --git a/test/files/test.txt b/input.txt similarity index 100% rename from test/files/test.txt rename to input.txt diff --git a/main.py b/main.py index d48e02e..6698911 100644 --- a/main.py +++ b/main.py @@ -3,10 +3,10 @@ from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( - EnvSettings, - InputSettings, - OutputSettings, - FileSettings + EnvSettings, + InputSettings, + OutputSettings, + FileSettings ) from scystream.sdk.file_handling.s3_manager import S3Operations @@ -81,7 +81,7 @@ def _preprocess_and_store(texts, settings): dtm, vocab = pre.generate_document_term_matrix() with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ - tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: + tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: pickle.dump(dtm, tmp_dtm) tmp_dtm.flush() @@ -108,40 +108,3 @@ def preprocess_bib_file(settings): attribute=settings.bib_input.SELECTED_ATTRIBUTE, ) _preprocess_and_store(texts, settings) - - -""" -if __name__ == "__main__": - test = PreprocessBIB( - bib_input=BIBFileInput( - S3_HOST="http://localhost", - S3_PORT="9000", - S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="input-bucket", - FILE_PATH="input_file_path", - FILE_NAME="wos_export", - SELECTED_ATTRIBUTE="abstract" - ), - dtm_output=DTMFileOutput( - S3_HOST="http://localhost", - S3_PORT="9000", - S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="output-bucket", - FILE_PATH="output_file_path", - FILE_NAME="dtm_file_bib" - ), - vocab_output=VocabFileOutput( - S3_HOST="http://localhost", - S3_PORT="9000", - 
S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="output-bucket", - FILE_PATH="output_file_path", - FILE_NAME="vocab_file_bib" - ) - ) - - preprocess_bib_file(test) -""" diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/loader.py b/preprocessing/loader.py index d55aac3..9ff51c6 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -5,17 +5,17 @@ def normalize_text(text: str) -> str: if not text: return "" - # Remove curly braces - text = re.sub(r"[{}]", "", text) - # Remove LaTeX commands - text = re.sub(r"\\[a-zA-Z]+\s*(\{[^}]*\})?", "", text) + text = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", text) + + text = re.sub(r"\\[a-zA-Z]+", "", text) - # Remove LaTeX escaped quotes/accents - text = re.sub(r"\\""[a-zA-Z]", lambda m: m.group(0)[-1], text) + text = re.sub(r"[{}]", "", text) + + text = re.sub(r'\\"([a-zA-Z])', r'\1', text) text = re.sub(r"\\'", "", text) - text = text.replace("'", "") + text = re.sub(r"\s+", " ", text) return text.strip() diff --git a/requirements.txt b/requirements.txt index 3493ec1..e7737db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ spacy==3.8.7 nltk==3.9.1 numpy==2.3.3 bibtexparser==1.4.3 +pytest==9.0.1 diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..5ce5cca --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,20 @@ +import pytest + +from preprocessing.core import Preprocessor + + +@pytest.fixture +def simple_texts(): + return ["This is a test sentence.", "Another test sentence."] + + +@pytest.fixture +def preprocessor(): + return Preprocessor( + language="en", + filter_stopwords=True, + unigram_normalizer="porter", + use_ngrams=True, + ngram_min=2, + ngram_max=3, + ) diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl new file mode 100644 index 0000000000000000000000000000000000000000..812760358e55fa9a738dfd127f9a613172985d55 GIT binary patch literal 7546 zcmdU!zityj7{q;c3=K6E^i-}XsG!5b1uqaKjwK=`c66KzcQpKobX}I$;u&}o2=+}g z(oeg4-jjuk`!usN^L@K}y2fA5er{fl_P_S|dV71fo1HH=^=fvzy=ks5>iS|A+GpqW zYPnf|ZtCr_2@j#2?vRR|HN2}X!}stpyqZ_@@rPg4xA1AwPOqBX-Rhrf zeEmAy|2POHZS^KB7K`_P9)J7CeAI-?`?tSe9H!~>tU6O_jrE|M2^d}PvfOpo6ZSqs z<5F|z96fK`ZNPcjm+OQ{Pxbg zuAg>;I$sKxItQ93HJ|*%>wD$-q$l3}rEqS))c1$xqkhhlKefN-2~)1ty^>z{>iQU_ zyiz>%r~I>SfqcYMp7Z3-<>}tVtET4qoTYK@JU%zg^2Mj-(ms_#H8r1flzS?k z=FH`3emZB*cfZz&*LrHc^q%KfI@P}4OZh!N7c0kWZjSVQ_alF5^?4mr4$bHKSYB-1 z^9XxRU!3}$(j0UyG!NwvpPQ4b_rBzJ{z+$a-m&KkyKZQlde435(7sAnnkTo<&^f8+ zo2n}{Z|*q_rKkPS_k+%f=A#_%OMc?rFHCyku5Q(--T0Rh2{iweFSlG2Z)nu5c{pRO^~=sCw+aXs%;(<&S;ur0ZK_??>NU r`RII3#cQr(E6typOMd55^IR9ZSM_nATd~jH$Y|zNg&@g<&{Do4xZ1 zxs1H~Eyw$q`8l?;Z67&Keh++;Zt{_b4?+i_bHWGVW45KK2c6UQ+<)av_jCQ*Z2KO3 Xk30_@`qfQ_J{lFV62D%d;~#e|CV&Gzm#H#0rgGqZP~C<=<{q)~Cg_zvp*=xc#OxAX0nvF%Xplp4H$~Wjw>PLPY%{>GW0y?WB{5N@jU+Z#ZhUk___AVQnzd z#8UHETgfQxtO?7Tc`gK^X5MZOpvnaZXFgocTdjO|8E=u5WtPYw_$e^?B5$(rYPPc4 z!XMb#%G-y*sFydhyqmW&^l$TQje1F2EAAYmCF z0*&^1X%7lK6q~1Huq9=Ddmh~}$XZKfd$~1q?>@TJMn~4 zk)7erJR7wAJa^%B&DEsU!uT>ioTptQTFO>L$4BsD9Wo4>T|1J;n^`|QJB5ah;wei< zy*5NHcNyP{$DH}y8$78cMoUG8I8es-fjA*z?8(Y{U!Js`R-5hi%n<9d689ld_i>BHfV5q@>f z$fI7;V|mNoot;Dj({ako62ep&A1`X?2<>9U2~b^#*kJJxQ2=6QyPFqDf#n~nD|n!0 zoDbsz!jFT#sF#YaPn3`i(S-yg+e2-rynZgmPz8q(R+ zqj^3Agx!3%NbndQUrN`~R^Bb+Q+dkL(eAtu_k1jmS{r&CW)#nXgBj5u&vPN*74QU1 zq96Jtp2(YPNMt>tj8AiuIvaly&zMgQ5M@hgQpQgP^E#zPyLqpd?SOYZ#jVvz``~zI 
z8K3UBvWS!asXSq(&_r3rXYi(=9huCTICi>&9h!@FICcyGN6(>@CT}!jHOrWYvBV2Q z7}8$8mf`ah+hVCLds^ED(|De~QY0*t9Ilv&)FDYeCugEtyJJsPv9L)&X7(KDxSeJRTCjfjIWHJ$s@~23&gp+L5K(;RMO_NcvIDs5Gg+P zY#v?CQt_wrctWYD*B>ll^7DBvXhR7a&C2*WJmw0E=VFM~7`CFgfaim@#LT2KNZ}06 z14~+SJfEk82)Zuh*%}h$s(&Wl3F+_W$8CK<2feX{2aiWZ`_FvjSXM;BK2Qn-2$Lo1oVW0lX_zk?d4n?lKe!+k4HxrQ~&w_#b^t%#)YM2nQS^B8)x9(o6) zQqpFLcj_Q?X*&v}co$FFFD-NL=9v(HUI#F!_dp6Wp-(XH<;6M_dFw9Y_bK19pwr;{ zp;$A6L82n-Az^>OPrQ~D{ejelALKdvrG-#iw3f^GLn5CEG1o#zohFnhuKHn2VH=18 z>a*8D-Wuec_j*iXtBxCZgCUK1eguMRkD8}D34RoU7}Qzy$NYDdHJ9;?U^@KHdCpC| z9=P-Jn|Y&eMeXFw91-vqUKqmURM?gy)~!5h{qp13Q1MVV?I(l@-!{%v#-9{T2#N`< zaz4fLb;zmc(=xY);LnI2K;g_O=fuxqUwt#MOiciv<8gGF+V=CpgVJF}{(^`zghr+~ z`=aRH#NjWAS~c#Py1IupyMS-Qs>9m>8-KsU+xCX5Lg2fid%e2&o)`ou4fgih z;`^eB8YcD9N|7Lk{6O??kE}lwZj_E}3jL87VSu<)`eTvO;IIrWxql*zhmdiCpNeC- zwDU7z&NO^{{JAh!!%%Wr$Nh!ay1#w-rI?3pRRa76^3F zZ+RgEs)JD(|IU8v3iaOszwc5$)IDEqlQ>Em5PvW9@uN&a!kH+^HE_ z_P`$oOWMD|2koOHh!Z7^G1XjkF*hb(N2r_j!g~TU^+Bm(hTIS+BnMks+ndRyvcM`S8*lHHF$9tmAKg(Wc`xn zMSeMZ(B=kD2qiWmo=giOM+LY;3vFrexI6pNqG-)jQBKaS4W6$-SWS<1v`sXC+`U@T zb`eWRa}1;d8az|A-D!&sZ18%8^)N;UiG(srk6<>H(rp{OFpNS9sXHmclIB8>4hF0B zt30|Llv#t|S0=X8AsA>k^iXI(NsV-C0-B8Z4^LHznN#Uf{m$rE+{J>2Ro^j`>UIh?Y?xuh3@LUtOb;k?j|y~a6(2xq`M1irr{0v9>Q7;tCW0CQL>&W>0Yv; zknS)^_eM+!kyqv8%K@Wi0*}^lZ zGUR-!&VM0qgDgdS=plFS07Yw9iOVP>Kbw8@-K)WIh2(@C;prU-D^k{+)7 z>jxHd-h>ws@Sn>S3!Q96g05C2T~+!WhF=p+9w9?*9C)NSl+xyQhaLsL4ggc^&?#cL za{ES)7UKXFc}sf?;$?_non;NAr|R7J)ZWn1V|8v1n9~kDPL$w+;N!)9OvAbO6GR6g z2rj6d$9L9l55hxymenab|*9F%)D07nK~;TuDnC;gb!+`MNl?KK5i*1dP=6Y3NEx2 zyaUtFa~|c=g%h8by(_X+fPX-vUBZMuZ&88lqog=36x$O$N|&~)@Yt{8J{S(bn+v^b z@B>3vo3jmfz`EXP(^+6$D443*-?L$=0bpf)nhwB0m7WgW$&nQ@+4yyi zGNLyp^bG8?xkYhzCiG0`q6RthJ{Nhlm$fszEVix#tCTS@RZH-R95>Vb(gC$wTg%$7%6JvNLjxK`%xO z7OGZ{T`X2m2N6Z|qJb_EMc9mVsn~(i*7Qq}t^$IJ^fFmdF8$~wVs@agTkaNsE|+mN z%-#jiOJ%q8`jcKJY8O(>Ro`^zZAdPS8QIoELG&Kk+k~1;A7VK9Y z_EL>r2P+>&<#?q@`t=B;?f_4306)V=cX}g)83JMLrYd_=IKY(*Zr1k@J?7%(5s2}T_RmI zPb~$%8(yS+`sQEKdk`$Ec6G#fuUl2WbGY-N_lb^V1=DLH~pJ=W-!xUU}(5@ZZ|Ero*j$Exc(U&#ClbxS&vCw|cC*4oN8hutR#byIw}fO*q|v zpeK~Se4&p({t&TG*Yr_vr+ED=TGGd`iTmjA=hquyeC`f}Zo+!CYfHKr-X0=VR=5RD zWZ$dkR;bI+?yZVG4n}K`d3~Z!V3>U=r%z%zL%R_CDU1pc`+ouG(+Dl*<&r+r7##V3 D$S9&b literal 0 HcmV?d00001 diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0698bc3d6c3ccde57d064f0e7c0306221b561679 GIT binary patch literal 400 zcmX|-&2GXl5QL#I5R-;~N?SPi*b^_}r+@^AB_L7*QZJQy>1~eYec82R`?51D&u8!c ziN}Ap~2=~ zAgQXN+BFBO`xr&1E~cQ?00VB7HIY(`Tdj)v@M9VRNc7t9kWh?KbjRupL+?E1l`tWa zBXPJ9B-QZt7 0 + + +def test_preprocessor_bag_of_words(preprocessor, simple_texts): + preprocessor.texts = simple_texts + preprocessor.analyze_texts() + preprocessor.generate_bag_of_words() + + assert len(preprocessor.bag_of_words) == 2 + assert all(len(doc) > 0 for doc in preprocessor.bag_of_words) + + +def test_generate_document_term_matrix(preprocessor, simple_texts): + preprocessor.texts = simple_texts + preprocessor.analyze_texts() + preprocessor.generate_bag_of_words() + + dtm, vocab = preprocessor.generate_document_term_matrix() + + assert dtm.shape[0] == 2 + assert dtm.shape[1] == len(vocab) + assert dtm.sum() > 0 From db2ab5546e413a5007906a7aeadb624ec87a39eb Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:56:11 -0600 Subject: [PATCH 2/9] chore: test cbc validity --- .github/workflows/ci.yaml | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3939887..6a49819 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,9 +25,44 @@ jobs: - name: Run flake8 uses: py-actions/flake8@v2 - test: + validate-compute-block: runs-on: ubuntu-latest needs: lint-python + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + + - name: Intall dependencies + run: | + pip install -r requirements.txt + + - name: Check cbcs + run: | + python3 - <<'EOF' + import main + + from scystream.sdk.config import load_config, get_compute_block + from scystream.sdk.config.config_loader import _compare_configs + from pathlib import Path + + CBC_PATH = Path("cbc.yaml") + + if not CBC_PATH.exists(): + raise FileNotFoundError("cbc.yaml not found in repo root.") + + block_from_code = get_compute_block() + block_from_yaml = load_config(str(CBC_PATH)) + + _compare_configs(block_from_code, block_from_yaml) + + print("cbc.yaml matches python code definition") + EOF + + run-test: + runs-on: ubuntu-latest + needs: validate-compute-block services: minio: image: lazybit/minio @@ -59,7 +94,7 @@ jobs: build: name: Build docker image runs-on: ubuntu-latest - needs: test + needs: run-test permissions: contents: read packages: write From 182e71c04c232f556cc73dd13a71e30095cbb471 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:58:14 -0600 Subject: [PATCH 3/9] chore: rename workflow steps --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6a49819..786a9ed 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -26,6 +26,7 @@ jobs: uses: py-actions/flake8@v2 validate-compute-block: + name: Validate Compute Block Config runs-on: ubuntu-latest needs: lint-python steps: @@ -61,6 +62,7 @@ jobs: EOF run-test: + name: Run Tests runs-on: ubuntu-latest needs: validate-compute-block services: @@ -92,7 +94,7 @@ jobs: run: pytest -vv build: - name: Build docker image + name: Build Docker Image runs-on: ubuntu-latest needs: run-test permissions: From ab58dbb75b9aa93f2faf7f763aea0fdf24cdffee Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 3 Dec 2025 13:50:20 -0600 Subject: [PATCH 4/9] chore: add logging --- main.py | 24 +++++++++++++++++++++++- preprocessing/core.py | 21 ++++++++++++++++++--- preprocessing/loader.py | 5 +++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 6698911..64b9e21 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import pickle import tempfile +import logging from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( @@ -14,6 +15,13 @@ from preprocessing.loader import TxtLoader, BibLoader +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + class DTMFileOutput(FileSettings, OutputSettings): __identifier__ = "dtm_output" @@ -66,6 +74,8 @@ class PreprocessBIB(EnvSettings): def _preprocess_and_store(texts, settings): """Shared preprocessing logic for TXT and BIB.""" + logger.info(f"Starting preprocessing with {len(texts)} documents") + pre = Preprocessor( language=settings.LANGUAGE, filter_stopwords=settings.FILTER_STOPWORDS, @@ -74,10 +84,12 @@ def _preprocess_and_store(texts, settings): ngram_min=settings.NGRAM_MIN, ngram_max=settings.NGRAM_MAX, ) - pre.texts = texts + pre.texts = texts pre.analyze_texts() + pre.generate_bag_of_words() + dtm, 
vocab = pre.generate_document_term_matrix() with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ @@ -89,20 +101,30 @@ def _preprocess_and_store(texts, settings): pickle.dump(vocab, tmp_vocab) tmp_vocab.flush() + logger.info("Uploading DTM to S3...") S3Operations.upload(settings.dtm_output, tmp_dtm.name) + + logger.info("Uploading vocabulary to S3...") S3Operations.upload(settings.vocab_output, tmp_vocab.name) + logger.info("Preprocessing completed successfully.") + @entrypoint(PreprocessTXT) def preprocess_txt_file(settings): + logger.info("Downloading TXT input from S3...") S3Operations.download(settings.txt_input, "input.txt") + texts = TxtLoader.load("./input.txt") + _preprocess_and_store(texts, settings) @entrypoint(PreprocessBIB) def preprocess_bib_file(settings): + logger.info("Downloading BIB input from S3...") S3Operations.download(settings.bib_input, "input.bib") + texts = BibLoader.load( "./input.bib", attribute=settings.bib_input.SELECTED_ATTRIBUTE, diff --git a/preprocessing/core.py b/preprocessing/core.py index 4db4585..dba2a8d 100644 --- a/preprocessing/core.py +++ b/preprocessing/core.py @@ -1,3 +1,4 @@ +import logging import spacy import numpy as np @@ -9,6 +10,7 @@ "en": "en_core_web_sm", "de": "de_core_news_sm" } +logger = logging.getLogger(__name__) class Preprocessor: @@ -21,6 +23,12 @@ def __init__( ngram_min: int = 2, ngram_max: int = 3, ): + logger.info( + "Init Preprocessor (lang=%s, filter_stopwords=%s, ngrams=%s)", + language, + filter_stopwords, + use_ngrams, + ) self.language = language self.filter_stopwords = filter_stopwords self.unigram_normalizer = unigram_normalizer @@ -58,6 +66,7 @@ def filter_tokens( ] def analyze_texts(self): + logger.info(f"Analyzing {len(self.texts)} texts...") porter = PorterStemmer() for text in self.texts: doc = self.nlp(text) @@ -67,8 +76,8 @@ def analyze_texts(self): for sentence in doc.sents: filtered_tokens = self.filter_tokens( - list(sentence), - self.filter_stopwords + list(sentence), + self.filter_stopwords ) normalized_tokens = [ self.normalize_token(t, porter) for t in filtered_tokens @@ -93,6 +102,10 @@ def analyze_texts(self): if ngram_list: self.ngram_frequency.update(ngram_list) self.ngram_document_frequency.update(set(ngram_list)) + logger.info( + f"Finished analyzing texts: {self.token_frequency} unigrams, { + self.ngram_frequency} n-grams", + ) def normalize_token( self, @@ -110,6 +123,7 @@ def normalize_token( return word def generate_bag_of_words(self): + logger.info("Generating bag-of-words...") porter = PorterStemmer() self.bag_of_words = [] @@ -177,7 +191,7 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict): dtm (np.ndarray): shape = (num_docs, num_terms) vocab (dict): mapping term -> column index """ - + logger.info("Building document-term-matrix...") all_terms = set() for doc in self.bag_of_words: for t in doc: @@ -194,4 +208,5 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict): term_idx = vocab[token["term"]] dtm[doc_idx, term_idx] += 1 + logger.info(f"Matrix shape: {dtm.shape} | Vocab size: {len(vocab)}") return dtm, vocab diff --git a/preprocessing/loader.py b/preprocessing/loader.py index 9ff51c6..50d0177 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -1,6 +1,9 @@ +import logging import re import bibtexparser +logger = logging.getLogger(__name__) + def normalize_text(text: str) -> str: if not text: @@ -24,6 +27,7 @@ def normalize_text(text: str) -> str: class TxtLoader: @staticmethod def load(file_path: str) -> list[str]: + 
logger.info("Loading TXT file...") with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() return [normalize_text(line) for line in lines] @@ -32,6 +36,7 @@ def load(file_path: str) -> list[str]: class BibLoader: @staticmethod def load(file_path: str, attribute: str) -> list[str]: + logger.info(f"Loading BIB file (attribute={attribute})...") with open(file_path, "r", encoding="utf-8") as f: bib_database = bibtexparser.load(f) From 8868ef1cf9a6d82c61f384d3767b77062fe84f24 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:46:47 -0600 Subject: [PATCH 5/9] fix: use absolute path for file download --- .github/workflows/ci.yaml | 2 +- cbc.yaml | 12 ++-- input.bib | 112 -------------------------------------- input.txt | 4 -- main.py | 13 +++-- 5 files changed, 16 insertions(+), 127 deletions(-) delete mode 100644 input.bib delete mode 100644 input.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 786a9ed..60e9630 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -38,7 +38,7 @@ jobs: - name: Intall dependencies run: | pip install -r requirements.txt - + - name: Check cbcs run: | python3 - <<'EOF' diff --git a/cbc.yaml b/cbc.yaml index 0932961..a24e387 100644 --- a/cbc.yaml +++ b/cbc.yaml @@ -1,10 +1,11 @@ author: Paul Kalhorn description: Language preprocessing for .txt or .bib files -docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing +docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing entrypoints: preprocess_bib_file: - description: Entrypoint for preprocessing a .bib file + description: Entrypoint for preprocessing a .bib file envs: + BIB_DOWNLOAD_PATH: /tmp/input.bib FILTER_STOPWORDS: true LANGUAGE: en NGRAM_MAX: 3 @@ -23,7 +24,7 @@ entrypoints: bib_file_S3_PORT: null bib_file_S3_SECRET_KEY: null bib_file_SELECTED_ATTRIBUTE: Abstract - description: The bib file, aswell as one attribute selected for preprocessing + description: The bib file, aswell as one attribute selected for preprocessing type: file outputs: dtm_output: @@ -36,7 +37,7 @@ entrypoints: dtm_output_S3_HOST: null dtm_output_S3_PORT: null dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file + description: Numpy representation of document-term matrix as .pkl file type: file vocab_output: config: @@ -57,6 +58,7 @@ entrypoints: LANGUAGE: en NGRAM_MAX: 3 NGRAM_MIN: 2 + TXT_DOWNLOAD_PATH: /tmp/input.txt UNIGRAM_NORMALIZER: porter USE_NGRAMS: true inputs: @@ -70,7 +72,7 @@ entrypoints: txt_file_S3_HOST: null txt_file_S3_PORT: null txt_file_S3_SECRET_KEY: null - description: A .txt file + description: A .txt file type: file outputs: dtm_output: diff --git a/input.bib b/input.bib deleted file mode 100644 index f525305..0000000 --- a/input.bib +++ /dev/null @@ -1,112 +0,0 @@ - -@article{ WOS:001016714700004, -Author = {White, Joel}, -Title = {Theoretical and Practical Paralogisms of Digital Immortality}, -Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY}, -Year = {2022}, -Volume = {9}, -Number = {2, SI}, -Pages = {155-172}, -Month = {JUL 3}, -Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic - and popular relevance; specific naive metaphysical ideas, such as - immortality, have returned with this rise. This article refrains from - any ethical or political assessment of transhumanism. 
Still, it - critiques the exact metaphysical or idealistic nature of transhumanism - and its pursuit of digital immortality: the idea that, through - technological advancements, precisely in Artificial General - Intelligence, an immortal virtual ``self{''} will become possible. The - article follows the form of Immanuel Kant's ``Paralogisms{''} from the - Critique of Pure Reason, where Kant is concerned with the substantial, - immortal nature of the soul and its experiential impossibility. The - article will offer theoretical and practical paralogisms (false logical - inferences), arguing that the transhumanist claim that digital - immortality is possible fundamentally stems from two incorrect major - premises. The first concerns the substantial nature of information, - which informs the theoretical paralogisms; the second concerns infinite - transformation (pure plasticity), which informs the practical - paralogisms}, -Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD}, -Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND}, -Type = {Article}, -Language = {English}, -DOI = {10.1080/20539320.2022.2150463}, -ISSN = {2053-9320}, -EISSN = {2053-9339}, -Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms; - Digital Immortality}, -Research-Areas = {Philosophy}, -Web-of-Science-Categories = {Philosophy}, -Author-Email = {jhmw01@gmail.com}, -ORCID-Numbers = {White, Joel/0000-0001-6460-0564}, -Number-of-Cited-References = {30}, -Times-Cited = {0}, -Usage-Count-Last-180-days = {3}, -Usage-Count-Since-2013 = {15}, -Journal-ISO = {J. Aesthet. Phenomenol.}, -Doc-Delivery-Number = {K5GF0}, -Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)}, -Unique-ID = {WOS:001016714700004}, -DA = {2025-06-26}, -} - -@article{ WOS:001322577100012, -Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir}, -Title = {Sociopolitical Challenges to Digital Transformation of Rural - Communities: Learnings from a Case Study From Manipur, India}, -Journal = {IT PROFESSIONAL}, -Year = {2024}, -Volume = {26}, -Number = {4}, -Pages = {42-47}, -Month = {JUL-AUG}, -Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized - the inclusive growth of digital networks and digital public goods, - utilizing a multistakeholder systems approach. Similarly, the - information and communications technology (ICT) Innovation and - Intervention Program of the Government of India's Digital North East - Vision 2022 has also emphasized a need for inclusive growth of ICT in - the Northeast Region. In line with the above, this article presents - insights from a field study conducted in the rural parts of Manipur, - India, which incidentally can be found to be applicable to many rural - parts of the developing world. The article envisions a community-driven - sociodigital transformation of the Northeast Region of India. In this - quest, the article highlights sociopolitical challenges for digital - transformation and provides insights for inclusive ICT in such - regions-infrastructure as a utility for every citizen, smart governance - and services on demand, digital empowerment of citizens, social welfare, - capacity building, and community engagement.}, -Publisher = {IEEE COMPUTER SOC}, -Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA}, -Type = {Article}, -Language = {English}, -Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India. 
- Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India. - Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India. - Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.}, -DOI = {10.1109/MITP.2024.3433459}, -ISSN = {1520-9202}, -EISSN = {1941-045X}, -Keywords = {Technological innovation; Digital transformation; Government; Buildings; - Asia; Africa; Information and communication technology}, -Research-Areas = {Computer Science; Telecommunications}, -Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software - Engineering; Telecommunications}, -Author-Email = {vkant@iitk.ac.in - sanjrampk@iiti.ac.in - sudhir.dixit@ieee.org}, -Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of - Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT - System); Indian Institute of Technology (IIT) - Indore}, -ResearcherID-Numbers = {/ITU-6308-2023}, -ORCID-Numbers = {/0000-0002-6215-7500}, -Number-of-Cited-References = {7}, -Times-Cited = {0}, -Usage-Count-Last-180-days = {11}, -Usage-Count-Since-2013 = {22}, -Journal-ISO = {IT Prof.}, -Doc-Delivery-Number = {H3O9D}, -Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)}, -Unique-ID = {WOS:001322577100012}, -DA = {2025-06-26}, -} diff --git a/input.txt b/input.txt deleted file mode 100644 index 1755b0f..0000000 --- a/input.txt +++ /dev/null @@ -1,4 +0,0 @@ -Cats chase mice. Dogs chase cats. -Birds fly high. Cats and dogs coexist. -Mice hide from cats. Birds sing loudly. -Cats and dogs coexist. Cats and dogs coexist. diff --git a/main.py b/main.py index 64b9e21..aa03bf3 100644 --- a/main.py +++ b/main.py @@ -14,7 +14,6 @@ from preprocessing.core import Preprocessor from preprocessing.loader import TxtLoader, BibLoader - logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' @@ -54,6 +53,8 @@ class PreprocessTXT(EnvSettings): NGRAM_MIN: int = 2 NGRAM_MAX: int = 3 + TXT_DOWNLOAD_PATH: str = "/tmp/input.txt" + txt_input: TXTFileInput dtm_output: DTMFileOutput vocab_output: VocabFileOutput @@ -67,6 +68,8 @@ class PreprocessBIB(EnvSettings): NGRAM_MIN: int = 2 NGRAM_MAX: int = 3 + BIB_DOWNLOAD_PATH: str = "/tmp/input.bib" + bib_input: BIBFileInput dtm_output: DTMFileOutput vocab_output: VocabFileOutput @@ -113,9 +116,9 @@ def _preprocess_and_store(texts, settings): @entrypoint(PreprocessTXT) def preprocess_txt_file(settings): logger.info("Downloading TXT input from S3...") - S3Operations.download(settings.txt_input, "input.txt") + S3Operations.download(settings.txt_input, settings.TXT_DOWNLOAD_PATH) - texts = TxtLoader.load("./input.txt") + texts = TxtLoader.load(settings.TXT_DOWNLOAD_PATH) _preprocess_and_store(texts, settings) @@ -123,10 +126,10 @@ def preprocess_txt_file(settings): @entrypoint(PreprocessBIB) def preprocess_bib_file(settings): logger.info("Downloading BIB input from S3...") - S3Operations.download(settings.bib_input, "input.bib") + S3Operations.download(settings.bib_input, settings.BIB_DOWNLOAD_PATH) texts = BibLoader.load( - "./input.bib", + settings.BIB_DOWNLOAD_PATH, attribute=settings.bib_input.SELECTED_ATTRIBUTE, ) _preprocess_and_store(texts, settings) From d159e80d9ab57156661356b370fe7180c223a8a4 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:48:38 -0600 Subject: [PATCH 6/9] fix: use lemma in cbc --- cbc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cbc.yaml b/cbc.yaml index a24e387..d20bcdf 100644 --- a/cbc.yaml 
+++ b/cbc.yaml @@ -10,7 +10,7 @@ entrypoints: LANGUAGE: en NGRAM_MAX: 3 NGRAM_MIN: 2 - UNIGRAM_NORMALIZER: porter + UNIGRAM_NORMALIZER: lemma USE_NGRAMS: true inputs: bib_input: @@ -59,7 +59,7 @@ entrypoints: NGRAM_MAX: 3 NGRAM_MIN: 2 TXT_DOWNLOAD_PATH: /tmp/input.txt - UNIGRAM_NORMALIZER: porter + UNIGRAM_NORMALIZER: lemma USE_NGRAMS: true inputs: txt_input: From 7994335a7eca8eddea66b59a92a2e51e846d8a7b Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:57:58 -0600 Subject: [PATCH 7/9] fix: use porter in tests --- test/test_full.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_full.py b/test/test_full.py index c12be20..c97b370 100644 --- a/test/test_full.py +++ b/test/test_full.py @@ -57,6 +57,8 @@ def test_full_bib(s3_minio): ) env = { + "UNIGRAM_NORMALIZER": "porter", + "bib_file_S3_HOST": "http://127.0.0.1", "bib_file_S3_PORT": "9000", "bib_file_S3_ACCESS_KEY": MINIO_USER, @@ -140,6 +142,8 @@ def test_full_txt(s3_minio): ) env = { + "UNIGRAM_NORMALIZER": "porter", + "txt_file_S3_HOST": "http://127.0.0.1", "txt_file_S3_PORT": "9000", "txt_file_S3_ACCESS_KEY": MINIO_USER, From 2100e4568813a9d3e227a2cd119aa9b39e7e7c7a Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Thu, 11 Dec 2025 14:05:04 -0600 Subject: [PATCH 8/9] feat: remove bow and dtm logic --- cbc.yaml | 70 +++------- docker-compose.yml | 15 ++- main.py | 80 ++++++------ preprocessing/core.py | 162 ++++------------------- preprocessing/loader.py | 37 +++++- preprocessing/models.py | 14 ++ requirements.txt | 4 +- test/files/expected_dtm_from_bib.pkl | Bin 7546 -> 0 bytes test/files/expected_dtm_from_txt.pkl | Bin 1081 -> 0 bytes test/files/expected_vocab_from_bib.pkl | Bin 9047 -> 0 bytes test/files/expected_vocab_from_txt.pkl | Bin 400 -> 0 bytes test/test_full.py | 174 ++++++++++--------------- test/test_loaders.py | 23 +++- test/test_preprocessor_unit.py | 69 ++++++---- 14 files changed, 278 insertions(+), 370 deletions(-) create mode 100644 preprocessing/models.py delete mode 100644 test/files/expected_dtm_from_bib.pkl delete mode 100644 test/files/expected_dtm_from_txt.pkl delete mode 100644 test/files/expected_vocab_from_bib.pkl delete mode 100644 test/files/expected_vocab_from_txt.pkl diff --git a/cbc.yaml b/cbc.yaml index d20bcdf..4df19d8 100644 --- a/cbc.yaml +++ b/cbc.yaml @@ -1,9 +1,9 @@ -author: Paul Kalhorn +author: Paul Kalhorn description: Language preprocessing for .txt or .bib files -docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing +docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing entrypoints: preprocess_bib_file: - description: Entrypoint for preprocessing a .bib file + description: Entrypoint for preprocessing an attribute of a .bib file envs: BIB_DOWNLOAD_PATH: /tmp/input.bib FILTER_STOPWORDS: true @@ -27,30 +27,15 @@ entrypoints: description: The bib file, aswell as one attribute selected for preprocessing type: file outputs: - dtm_output: + normalized_docs_output: config: - dtm_output_BUCKET_NAME: null - dtm_output_FILE_EXT: pkl - dtm_output_FILE_NAME: null - dtm_output_FILE_PATH: null - dtm_output_S3_ACCESS_KEY: null - dtm_output_S3_HOST: null - dtm_output_S3_PORT: null - dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file - type: file - vocab_output: - config: - vocab_output_BUCKET_NAME: null - vocab_output_FILE_EXT: pkl - vocab_output_FILE_NAME: null - vocab_output_FILE_PATH: null - vocab_output_S3_ACCESS_KEY: null - vocab_output_S3_HOST: null - vocab_output_S3_PORT: 
null - vocab_output_S3_SECRET_KEY: null - description: Pkl file of a dictionary that maps all words to their index in the DTM - type: file + normalized_docs_DB_TABLE: null + normalized_docs_PG_HOST: null + normalized_docs_PG_PASS: null + normalized_docs_PG_PORT: null + normalized_docs_PG_USER: null + description: Database Output, containing bib_id aswell as the normalized text + type: pg_table preprocess_txt_file: description: Entrypoint to preprocess a .txt file envs: @@ -72,31 +57,16 @@ entrypoints: txt_file_S3_HOST: null txt_file_S3_PORT: null txt_file_S3_SECRET_KEY: null - description: A .txt file + description: A .txt file, each line will be treated as a document type: file outputs: - dtm_output: + normalized_docs_output: config: - dtm_output_BUCKET_NAME: null - dtm_output_FILE_EXT: pkl - dtm_output_FILE_NAME: null - dtm_output_FILE_PATH: null - dtm_output_S3_ACCESS_KEY: null - dtm_output_S3_HOST: null - dtm_output_S3_PORT: null - dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file - type: file - vocab_output: - config: - vocab_output_BUCKET_NAME: null - vocab_output_FILE_EXT: pkl - vocab_output_FILE_NAME: null - vocab_output_FILE_PATH: null - vocab_output_S3_ACCESS_KEY: null - vocab_output_S3_HOST: null - vocab_output_S3_PORT: null - vocab_output_S3_SECRET_KEY: null - description: Pkl file of a dictionary that maps all words to their index in the DTM - type: file + normalized_docs_DB_TABLE: null + normalized_docs_PG_HOST: null + normalized_docs_PG_PASS: null + normalized_docs_PG_PORT: null + normalized_docs_PG_USER: null + description: Database Output, containing bib_id aswell as the normalized text + type: pg_table name: Language-Preprocessing diff --git a/docker-compose.yml b/docker-compose.yml index 3c73fe4..d75aa02 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,13 +13,16 @@ services: ports: - "9000:9000" - "9001:9001" - networks: - - scystream-net -networks: - scystream-net: - driver: bridge + postgres: + image: postgres:13 + container_name: postgres + environment: + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=postgres + - POSTGRES_DB=postgres + ports: + - "5432:5432" volumes: minio_data: - search_query: diff --git a/main.py b/main.py index e599750..3223be6 100644 --- a/main.py +++ b/main.py @@ -1,18 +1,21 @@ -import pickle -import tempfile import logging +import pandas as pd +from sqlalchemy import create_engine +from typing import List from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( EnvSettings, InputSettings, OutputSettings, - FileSettings + FileSettings, + PostgresSettings ) from scystream.sdk.file_handling.s3_manager import S3Operations from preprocessing.core import Preprocessor from preprocessing.loader import TxtLoader, BibLoader +from preprocessing.models import DocumentRecord, PreprocessedDocument logging.basicConfig( level=logging.INFO, @@ -21,16 +24,8 @@ logger = logging.getLogger(__name__) -class DTMFileOutput(FileSettings, OutputSettings): - __identifier__ = "dtm_output" - - FILE_EXT: str = "pkl" - - -class VocabFileOutput(FileSettings, OutputSettings): - __identifier__ = "vocab_output" - - FILE_EXT: str = "pkl" +class NormalizedDocsOutput(PostgresSettings, OutputSettings): + __identifier__ = "normalized_docs" class TXTFileInput(FileSettings, InputSettings): @@ -56,8 +51,7 @@ class PreprocessTXT(EnvSettings): TXT_DOWNLOAD_PATH: str = "/tmp/input.txt" txt_input: TXTFileInput - dtm_output: DTMFileOutput - vocab_output: VocabFileOutput + normalized_docs_output: 
NormalizedDocsOutput class PreprocessBIB(EnvSettings): @@ -71,13 +65,37 @@ class PreprocessBIB(EnvSettings): BIB_DOWNLOAD_PATH: str = "/tmp/input.bib" bib_input: BIBFileInput - dtm_output: DTMFileOutput - vocab_output: VocabFileOutput + normalized_docs_output: NormalizedDocsOutput + + +def _write_preprocessed_docs_to_postgres( + preprocessed_ouput: List[PreprocessedDocument], + settings: PostgresSettings +): + df = pd.DataFrame([ + { + "doc_id": d.doc_id, + "tokens": d.tokens + } + for d in preprocessed_ouput + ]) + + logger.info(f"Writing {len(df)} processed documents to DB table '{ + settings.DB_TABLE}'…") + engine = create_engine( + f"postgresql+psycopg2://{settings.PG_USER}:{settings.PG_PASS}" + f"@{settings.PG_HOST}:{int(settings.PG_PORT)}/" + ) + + df.to_sql(settings.DB_TABLE, engine, if_exists="replace", index=False) + + logger.info(f"Successfully stored normalized documents into '{ + settings.DB_TABLE}'.") -def _preprocess_and_store(texts, settings): +def _preprocess_and_store(documents: List[DocumentRecord], settings): """Shared preprocessing logic for TXT and BIB.""" - logger.info(f"Starting preprocessing with {len(texts)} documents") + logger.info(f"Starting preprocessing with {len(documents)} documents") pre = Preprocessor( language=settings.LANGUAGE, @@ -88,27 +106,11 @@ def _preprocess_and_store(texts, settings): ngram_max=settings.NGRAM_MAX, ) - pre.texts = texts - pre.analyze_texts() - - pre.generate_bag_of_words() - - dtm, vocab = pre.generate_document_term_matrix() - - with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ - tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: - - pickle.dump(dtm, tmp_dtm) - tmp_dtm.flush() - - pickle.dump(vocab, tmp_vocab) - tmp_vocab.flush() - - logger.info("Uploading DTM to S3...") - S3Operations.upload(settings.dtm_output, tmp_dtm.name) + pre.documents = documents + result = pre.generate_normalized_output() - logger.info("Uploading vocabulary to S3...") - S3Operations.upload(settings.vocab_output, tmp_vocab.name) + _write_preprocessed_docs_to_postgres( + result, settings.normalized_docs_output) logger.info("Preprocessing completed successfully.") diff --git a/preprocessing/core.py b/preprocessing/core.py index dba2a8d..f819839 100644 --- a/preprocessing/core.py +++ b/preprocessing/core.py @@ -1,10 +1,9 @@ import logging import spacy -import numpy as np -from typing import Literal +from typing import Literal, List from nltk.stem.porter import PorterStemmer -from collections import Counter +from preprocessing.models import PreprocessedDocument, DocumentRecord LANG_TO_SPACY_MODELS = { "en": "en_core_web_sm", @@ -37,22 +36,14 @@ def __init__( self.ngram_max = ngram_max self.nlp_model = LANG_TO_SPACY_MODELS.get(language, "en_core_web_sm") - - self.ngram_frequency = Counter() - self.ngram_document_frequency = Counter() - self.token_frequency = Counter() - self.token_document_frequency = Counter() - - self.texts: list[str] = [] - - self.bag_of_words = [] - try: self.nlp = spacy.load(self.nlp_model, disable=["ner"]) except OSError: spacy.cli.download(self.nlp_model) self.nlp = spacy.load(self.nlp_model, disable=["ner"]) + self.documents: List[DocumentRecord] = [] + def filter_tokens( self, tokens: list[spacy.tokens.Token], @@ -65,47 +56,39 @@ def filter_tokens( and len(t.text) > 2 ] - def analyze_texts(self): - logger.info(f"Analyzing {len(self.texts)} texts...") + def generate_normalized_output(self) -> List[PreprocessedDocument]: + logger.info("Generating normalized output...") porter = PorterStemmer() - for text in 
self.texts: - doc = self.nlp(text) - token_list = [] - ngram_list = [] + processed_docs: List[PreprocessedDocument] = [] + + for record in self.documents: + doc = self.nlp(record.text) + doc_terms = [] - for sentence in doc.sents: - filtered_tokens = self.filter_tokens( - list(sentence), - self.filter_stopwords + # Process each sentence + for sent in doc.sents: + filtered = self.filter_tokens( + list(sent), self.filter_stopwords ) - normalized_tokens = [ - self.normalize_token(t, porter) for t in filtered_tokens + normalized = [ + self.normalize_token(t, porter) for t in filtered ] - token_list.extend(normalized_tokens) + doc_terms.extend(normalized) - if ( - self.use_ngrams and - self.ngram_min > 1 and - self.ngram_max > 1 - ): + # Generate n-grams + if self.use_ngrams and self.ngram_min > 1: for n in range(self.ngram_min, self.ngram_max + 1): - for i in range(len(normalized_tokens) - n + 1): - ngram = " ".join(normalized_tokens[i:i+n]) - ngram_list.append(ngram) + for i in range(len(normalized) - n + 1): + ngram = " ".join(normalized[i:i+n]) + doc_terms.append(ngram) - # update unigram counters - self.token_frequency.update(token_list) - self.token_document_frequency.update(set(token_list)) + processed_docs.append(PreprocessedDocument( + doc_id=record.doc_id, + tokens=doc_terms + )) - # update n-gram counters if any - if ngram_list: - self.ngram_frequency.update(ngram_list) - self.ngram_document_frequency.update(set(ngram_list)) - logger.info( - f"Finished analyzing texts: {self.token_frequency} unigrams, { - self.ngram_frequency} n-grams", - ) + return processed_docs def normalize_token( self, @@ -121,92 +104,3 @@ def normalize_token( elif self.unigram_normalizer == "lemma": return token.lemma_.lower() return word - - def generate_bag_of_words(self): - logger.info("Generating bag-of-words...") - porter = PorterStemmer() - self.bag_of_words = [] - - for text in self.texts: - doc = self.nlp(text) - doc_terms = [] - - for sent in doc.sents: - tokens = self.filter_tokens(list(sent), self.filter_stopwords) - - # Handle unigrams - for token in tokens: - normalized = self.normalize_token(token, porter) - - token_dict = { - "term": normalized, - "type": "word", - "span": 1, - "freq": self.token_frequency.get(normalized, 0), - "docs": ( - self.token_document_frequency.get(normalized, 0) - ), - "filters": ( - ["stop"] if not self.filter_stopwords - and token.is_stop else [] - ) - } - doc_terms.append(token_dict) - - # Handle ngrams - if self.use_ngrams and self.ngram_min > 1: - added_ngrams = set() # avoid duplicates - for n in range(self.ngram_min, self.ngram_max + 1): - for i in range(len(tokens) - n + 1): - ngram_tokens = tokens[i:i+n] - ngram_str = " ".join( - [self.normalize_token(t, porter) - for t in ngram_tokens] - ) - - if ngram_str in added_ngrams: - continue - added_ngrams.add(ngram_str) - - ngram_dict = { - "term": ngram_str, - "type": "ngram", - "span": n, - "freq": self.ngram_frequency.get(ngram_str, 0), - "docs": ( - self.ngram_document_frequency.get( - ngram_str, 0 - ) - ), - "filters": [] - } - doc_terms.append(ngram_dict) - - self.bag_of_words.append(doc_terms) - - def generate_document_term_matrix(self) -> (np.ndarray, dict): - """ - Converts bag_of_words into document-term matrix - - dtm (np.ndarray): shape = (num_docs, num_terms) - vocab (dict): mapping term -> column index - """ - logger.info("Building document-term-matrix...") - all_terms = set() - for doc in self.bag_of_words: - for t in doc: - all_terms.add(t["term"]) - - vocab = {term: idx for idx, term in 
enumerate(sorted(all_terms))} - - num_docs = len(self.bag_of_words) - num_terms = len(vocab) - dtm = np.zeros((num_docs, num_terms), dtype=int) - - for doc_idx, doc in enumerate(self.bag_of_words): - for token in doc: - term_idx = vocab[token["term"]] - dtm[doc_idx, term_idx] += 1 - - logger.info(f"Matrix shape: {dtm.shape} | Vocab size: {len(vocab)}") - return dtm, vocab diff --git a/preprocessing/loader.py b/preprocessing/loader.py index 50d0177..ecb5193 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -2,6 +2,8 @@ import re import bibtexparser +from preprocessing.models import DocumentRecord + logger = logging.getLogger(__name__) @@ -26,23 +28,46 @@ def normalize_text(text: str) -> str: class TxtLoader: @staticmethod - def load(file_path: str) -> list[str]: - logger.info("Loading TXT file...") + def load(file_path: str) -> list[DocumentRecord]: with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() - return [normalize_text(line) for line in lines] + + return [ + DocumentRecord( + doc_id=str(i), + text=normalize_text(line) + ) + for i, line in enumerate(lines, start=1) + ] class BibLoader: @staticmethod - def load(file_path: str, attribute: str) -> list[str]: + def load(file_path: str, attribute: str) -> list[DocumentRecord]: logger.info(f"Loading BIB file (attribute={attribute})...") + with open(file_path, "r", encoding="utf-8") as f: bib_database = bibtexparser.load(f) results = [] + attribute_lower = attribute.lower() + for entry in bib_database.entries: - value = entry.get(attribute.lower(), "") - results.append(normalize_text(value)) + bib_id = ( + entry.get("id") + or entry.get("ID") + or entry.get("citekey") + or entry.get("entrykey") + or entry.get("Unique-ID") + or "UNKNOWN_ID" + ) + + raw_value = entry.get(attribute_lower, "") + normalized = normalize_text(raw_value) + + results.append(DocumentRecord( + doc_id=bib_id, + text=normalized + )) return results diff --git a/preprocessing/models.py b/preprocessing/models.py new file mode 100644 index 0000000..48ddc4c --- /dev/null +++ b/preprocessing/models.py @@ -0,0 +1,14 @@ +from typing import List +from dataclasses import dataclass + + +@dataclass +class DocumentRecord: + doc_id: str # "0", "1", ... 
for TXT OR bib_id for BIB + text: str # normalized text + + +@dataclass +class PreprocessedDocument: + doc_id: str + tokens: List[str] diff --git a/requirements.txt b/requirements.txt index e7737db..1804e39 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ scystream-sdk==1.2.2 spacy==3.8.7 nltk==3.9.1 -numpy==2.3.3 bibtexparser==1.4.3 pytest==9.0.1 +pandas==2.3.3 +SQLAlchemy==2.0.43 +psycopg2-binary==2.9.10 diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl deleted file mode 100644 index 812760358e55fa9a738dfd127f9a613172985d55..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7546 zcmdU!zityj7{q;c3=K6E^i-}XsG!5b1uqaKjwK=`c66KzcQpKobX}I$;u&}o2=+}g z(oeg4-jjuk`!usN^L@K}y2fA5er{fl_P_S|dV71fo1HH=^=fvzy=ks5>iS|A+GpqW zYPnf|ZtCr_2@j#2?vRR|HN2}X!}stpyqZ_@@rPg4xA1AwPOqBX-Rhrf zeEmAy|2POHZS^KB7K`_P9)J7CeAI-?`?tSe9H!~>tU6O_jrE|M2^d}PvfOpo6ZSqs z<5F|z96fK`ZNPcjm+OQ{Pxbg zuAg>;I$sKxItQ93HJ|*%>wD$-q$l3}rEqS))c1$xqkhhlKefN-2~)1ty^>z{>iQU_ zyiz>%r~I>SfqcYMp7Z3-<>}tVtET4qoTYK@JU%zg^2Mj-(ms_#H8r1flzS?k z=FH`3emZB*cfZz&*LrHc^q%KfI@P}4OZh!N7c0kWZjSVQ_alF5^?4mr4$bHKSYB-1 z^9XxRU!3}$(j0UyG!NwvpPQ4b_rBzJ{z+$a-m&KkyKZQlde435(7sAnnkTo<&^f8+ zo2n}{Z|*q_rKkPS_k+%f=A#_%OMc?rFHCyku5Q(--T0Rh2{iweFSlG2Z)nu5c{pRO^~=sCw+aXs%;(<&S;ur0ZK_??>NU r`RII3#cQr(E6typOMd55^IR9ZSM_nATd~jH$Y|zNg&@g<&{Do4xZ1 zxs1H~Eyw$q`8l?;Z67&Keh++;Zt{_b4?+i_bHWGVW45KK2c6UQ+<)av_jCQ*Z2KO3 Xk30_@`qfQ_J{lFV62D%d;~#e|CV&Gzm#H#0rgGqZP~C<=<{q)~Cg_zvp*=xc#OxAX0nvF%Xplp4H$~Wjw>PLPY%{>GW0y?WB{5N@jU+Z#ZhUk___AVQnzd z#8UHETgfQxtO?7Tc`gK^X5MZOpvnaZXFgocTdjO|8E=u5WtPYw_$e^?B5$(rYPPc4 z!XMb#%G-y*sFydhyqmW&^l$TQje1F2EAAYmCF z0*&^1X%7lK6q~1Huq9=Ddmh~}$XZKfd$~1q?>@TJMn~4 zk)7erJR7wAJa^%B&DEsU!uT>ioTptQTFO>L$4BsD9Wo4>T|1J;n^`|QJB5ah;wei< zy*5NHcNyP{$DH}y8$78cMoUG8I8es-fjA*z?8(Y{U!Js`R-5hi%n<9d689ld_i>BHfV5q@>f z$fI7;V|mNoot;Dj({ako62ep&A1`X?2<>9U2~b^#*kJJxQ2=6QyPFqDf#n~nD|n!0 zoDbsz!jFT#sF#YaPn3`i(S-yg+e2-rynZgmPz8q(R+ zqj^3Agx!3%NbndQUrN`~R^Bb+Q+dkL(eAtu_k1jmS{r&CW)#nXgBj5u&vPN*74QU1 zq96Jtp2(YPNMt>tj8AiuIvaly&zMgQ5M@hgQpQgP^E#zPyLqpd?SOYZ#jVvz``~zI z8K3UBvWS!asXSq(&_r3rXYi(=9huCTICi>&9h!@FICcyGN6(>@CT}!jHOrWYvBV2Q z7}8$8mf`ah+hVCLds^ED(|De~QY0*t9Ilv&)FDYeCugEtyJJsPv9L)&X7(KDxSeJRTCjfjIWHJ$s@~23&gp+L5K(;RMO_NcvIDs5Gg+P zY#v?CQt_wrctWYD*B>ll^7DBvXhR7a&C2*WJmw0E=VFM~7`CFgfaim@#LT2KNZ}06 z14~+SJfEk82)Zuh*%}h$s(&Wl3F+_W$8CK<2feX{2aiWZ`_FvjSXM;BK2Qn-2$Lo1oVW0lX_zk?d4n?lKe!+k4HxrQ~&w_#b^t%#)YM2nQS^B8)x9(o6) zQqpFLcj_Q?X*&v}co$FFFD-NL=9v(HUI#F!_dp6Wp-(XH<;6M_dFw9Y_bK19pwr;{ zp;$A6L82n-Az^>OPrQ~D{ejelALKdvrG-#iw3f^GLn5CEG1o#zohFnhuKHn2VH=18 z>a*8D-Wuec_j*iXtBxCZgCUK1eguMRkD8}D34RoU7}Qzy$NYDdHJ9;?U^@KHdCpC| z9=P-Jn|Y&eMeXFw91-vqUKqmURM?gy)~!5h{qp13Q1MVV?I(l@-!{%v#-9{T2#N`< zaz4fLb;zmc(=xY);LnI2K;g_O=fuxqUwt#MOiciv<8gGF+V=CpgVJF}{(^`zghr+~ z`=aRH#NjWAS~c#Py1IupyMS-Qs>9m>8-KsU+xCX5Lg2fid%e2&o)`ou4fgih z;`^eB8YcD9N|7Lk{6O??kE}lwZj_E}3jL87VSu<)`eTvO;IIrWxql*zhmdiCpNeC- zwDU7z&NO^{{JAh!!%%Wr$Nh!ay1#w-rI?3pRRa76^3F zZ+RgEs)JD(|IU8v3iaOszwc5$)IDEqlQ>Em5PvW9@uN&a!kH+^HE_ z_P`$oOWMD|2koOHh!Z7^G1XjkF*hb(N2r_j!g~TU^+Bm(hTIS+BnMks+ndRyvcM`S8*lHHF$9tmAKg(Wc`xn zMSeMZ(B=kD2qiWmo=giOM+LY;3vFrexI6pNqG-)jQBKaS4W6$-SWS<1v`sXC+`U@T zb`eWRa}1;d8az|A-D!&sZ18%8^)N;UiG(srk6<>H(rp{OFpNS9sXHmclIB8>4hF0B zt30|Llv#t|S0=X8AsA>k^iXI(NsV-C0-B8Z4^LHznN#Uf{m$rE+{J>2Ro^j`>UIh?Y?xuh3@LUtOb;k?j|y~a6(2xq`M1irr{0v9>Q7;tCW0CQL>&W>0Yv; zknS)^_eM+!kyqv8%K@Wi0*}^lZ zGUR-!&VM0qgDgdS=plFS07Yw9iOVP>Kbw8@-K)WIh2(@C;prU-D^k{+)7 z>jxHd-h>ws@Sn>S3!Q96g05C2T~+!WhF=p+9w9?*9C)NSl+xyQhaLsL4ggc^&?#cL 
za{ES)7UKXFc}sf?;$?_non;NAr|R7J)ZWn1V|8v1n9~kDPL$w+;N!)9OvAbO6GR6g z2rj6d$9L9l55hxymenab|*9F%)D07nK~;TuDnC;gb!+`MNl?KK5i*1dP=6Y3NEx2 zyaUtFa~|c=g%h8by(_X+fPX-vUBZMuZ&88lqog=36x$O$N|&~)@Yt{8J{S(bn+v^b z@B>3vo3jmfz`EXP(^+6$D443*-?L$=0bpf)nhwB0m7WgW$&nQ@+4yyi zGNLyp^bG8?xkYhzCiG0`q6RthJ{Nhlm$fszEVix#tCTS@RZH-R95>Vb(gC$wTg%$7%6JvNLjxK`%xO z7OGZ{T`X2m2N6Z|qJb_EMc9mVsn~(i*7Qq}t^$IJ^fFmdF8$~wVs@agTkaNsE|+mN z%-#jiOJ%q8`jcKJY8O(>Ro`^zZAdPS8QIoELG&Kk+k~1;A7VK9Y z_EL>r2P+>&<#?q@`t=B;?f_4306)V=cX}g)83JMLrYd_=IKY(*Zr1k@J?7%(5s2}T_RmI zPb~$%8(yS+`sQEKdk`$Ec6G#fuUl2WbGY-N_lb^V1=DLH~pJ=W-!xUU}(5@ZZ|Ero*j$Exc(U&#ClbxS&vCw|cC*4oN8hutR#byIw}fO*q|v zpeK~Se4&p({t&TG*Yr_vr+ED=TGGd`iTmjA=hquyeC`f}Zo+!CYfHKr-X0=VR=5RD zWZ$dkR;bI+?yZVG4n}K`d3~Z!V3>U=r%z%zL%R_CDU1pc`+ouG(+Dl*<&r+r7##V3 D$S9&b diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl deleted file mode 100644 index 0698bc3d6c3ccde57d064f0e7c0306221b561679..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 400 zcmX|-&2GXl5QL#I5R-;~N?SPi*b^_}r+@^AB_L7*QZJQy>1~eYec82R`?51D&u8!c ziN}Ap~2=~ zAgQXN+BFBO`xr&1E~cQ?00VB7HIY(`Tdj)v@M9VRNc7t9kWh?KbjRupL+?E1l`tWa zBXPJ9B-QZt7 list[str]: + # Convert Postgres literal → Python list + arr = arr.strip("{}") + if not arr: + return [] + # handle quoted items + return [a.strip('"') for a in arr.split(",")] + def ensure_bucket(s3, bucket): try: @@ -44,21 +56,23 @@ def s3_minio(): def test_full_bib(s3_minio): input_file_name = "input" - dtm_output_file_name = "dtm_file" - vocab_output_file_name = "vocab_file" bib_path = Path(__file__).parent / "files" / f"{input_file_name}.bib" bib_bytes = bib_path.read_bytes() + # Upload to MinIO s3_minio.put_object( Bucket=BUCKET_NAME, Key=f"{input_file_name}.bib", Body=bib_bytes ) + # ENV for preprocess_bib_file env = { + # Preprocessor config "UNIGRAM_NORMALIZER": "porter", + # BIB INPUT S3 "bib_file_S3_HOST": "http://127.0.0.1", "bib_file_S3_PORT": "9000", "bib_file_S3_ACCESS_KEY": MINIO_USER, @@ -68,73 +82,54 @@ def test_full_bib(s3_minio): "bib_file_FILE_NAME": input_file_name, "bib_file_SELECTED_ATTRIBUTE": "abstract", - "dtm_output_S3_HOST": "http://127.0.0.1", - "dtm_output_S3_PORT": "9000", - "dtm_output_S3_ACCESS_KEY": MINIO_USER, - "dtm_output_S3_SECRET_KEY": MINIO_PWD, - "dtm_output_BUCKET_NAME": BUCKET_NAME, - "dtm_output_FILE_PATH": "", - "dtm_output_FILE_NAME": dtm_output_file_name, - - "vocab_output_S3_HOST": "http://127.0.0.1", - "vocab_output_S3_PORT": "9000", - "vocab_output_S3_ACCESS_KEY": MINIO_USER, - "vocab_output_S3_SECRET_KEY": MINIO_PWD, - "vocab_output_BUCKET_NAME": BUCKET_NAME, - "vocab_output_FILE_PATH": "", - "vocab_output_FILE_NAME": vocab_output_file_name, + # PostgreSQL output + "normalized_docs_PG_HOST": "localhost", + "normalized_docs_PG_PORT": "5432", + "normalized_docs_PG_USER": PG_USER, + "normalized_docs_PG_PASS": PG_PASS, + "normalized_docs_DB_TABLE": "normalized_docs_bib", } for k, v in env.items(): os.environ[k] = v + # Run block preprocess_bib_file() - keys = [ - o["Key"] - for o in s3_minio.list_objects_v2( - Bucket="testbucket").get("Contents", []) - ] - - assert f"{dtm_output_file_name}.pkl" in keys - assert f"{vocab_output_file_name}.pkl" in keys - - dtm_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - dtm_output_file_name}.pkl") - vocab_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - vocab_output_file_name}.pkl") - - # Load produced results - with open(dtm_path, "rb") as f: - dtm = pickle.load(f) - - with open(vocab_path, "rb") as f: - vocab = pickle.load(f) - 
- # Load expected snapshot files - expected_vocab_path = Path(__file__).parent / \ - "files" / "expected_vocab_from_bib.pkl" - expected_dtm_path = Path(__file__).parent / "files" / \ - "expected_dtm_from_bib.pkl" + # Query PostgreSQL for inserted documents + engine = create_engine( + f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@localhost:5432/" + ) + df = pd.read_sql_table("normalized_docs_bib", engine) - with open(expected_vocab_path, "rb") as f: - expected_vocab = pickle.load(f) + # Assertions + assert len(df) > 0 + assert "doc_id" in df.columns + assert "tokens" in df.columns + + # doc_id increments + assert len(df["doc_id"]) == len(df) # doc_id count matches rows + assert df["doc_id"].is_unique # no duplicates + assert all(isinstance(x, str) for x in df["doc_id"]) # Bib IDs are strings + + assert set(df["doc_id"]) == { + "WOS:001016714700004", + "WOS:001322577100012" + } - with open(expected_dtm_path, "rb") as f: - expected_dtm = pickle.load(f) + df["tokens"] = df["tokens"].apply(parse_pg_array) - assert vocab == expected_vocab - np.testing.assert_array_equal(dtm, expected_dtm) + assert isinstance(df.iloc[0]["tokens"], list) + assert all(isinstance(t, str) for t in df.iloc[0]["tokens"]) def test_full_txt(s3_minio): input_file_name = "input" - dtm_output_file_name = "dtm_txt_file" - vocab_output_file_name = "vocab_txt_file" txt_path = Path(__file__).parent / "files" / f"{input_file_name}.txt" txt_bytes = txt_path.read_bytes() + # Upload input to MinIO s3_minio.put_object( Bucket=BUCKET_NAME, Key=f"{input_file_name}.txt", @@ -144,6 +139,7 @@ def test_full_txt(s3_minio): env = { "UNIGRAM_NORMALIZER": "porter", + # TXT input S3 "txt_file_S3_HOST": "http://127.0.0.1", "txt_file_S3_PORT": "9000", "txt_file_S3_ACCESS_KEY": MINIO_USER, @@ -152,21 +148,12 @@ def test_full_txt(s3_minio): "txt_file_FILE_PATH": "", "txt_file_FILE_NAME": input_file_name, - "dtm_output_S3_HOST": "http://127.0.0.1", - "dtm_output_S3_PORT": "9000", - "dtm_output_S3_ACCESS_KEY": MINIO_USER, - "dtm_output_S3_SECRET_KEY": MINIO_PWD, - "dtm_output_BUCKET_NAME": BUCKET_NAME, - "dtm_output_FILE_PATH": "", - "dtm_output_FILE_NAME": dtm_output_file_name, - - "vocab_output_S3_HOST": "http://127.0.0.1", - "vocab_output_S3_PORT": "9000", - "vocab_output_S3_ACCESS_KEY": MINIO_USER, - "vocab_output_S3_SECRET_KEY": MINIO_PWD, - "vocab_output_BUCKET_NAME": BUCKET_NAME, - "vocab_output_FILE_PATH": "", - "vocab_output_FILE_NAME": vocab_output_file_name, + # Postgres output + "normalized_docs_PG_HOST": "localhost", + "normalized_docs_PG_PORT": "5432", + "normalized_docs_PG_USER": PG_USER, + "normalized_docs_PG_PASS": PG_PASS, + "normalized_docs_DB_TABLE": "normalized_docs_txt", } for k, v in env.items(): @@ -174,44 +161,19 @@ def test_full_txt(s3_minio): preprocess_txt_file() - keys = [ - o["Key"] - for o in s3_minio.list_objects_v2( - Bucket=BUCKET_NAME).get("Contents", []) - ] - - assert f"{dtm_output_file_name}.pkl" in keys - assert f"{vocab_output_file_name}.pkl" in keys - - # Download produced files - dtm_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - dtm_output_file_name}.pkl") - vocab_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - vocab_output_file_name}.pkl") - - # Load produced results - with open(dtm_path, "rb") as f: - dtm = pickle.load(f) - - with open(vocab_path, "rb") as f: - vocab = pickle.load(f) - - # Load expected snapshot files - expected_vocab_path = Path(__file__).parent / \ - "files" / "expected_vocab_from_txt.pkl" - expected_dtm_path = Path(__file__).parent / \ - "files" / "expected_dtm_from_txt.pkl" - - 
with open(expected_vocab_path, "rb") as f: - expected_vocab = pickle.load(f) - - with open(expected_dtm_path, "rb") as f: - expected_dtm = pickle.load(f) + # Query PostgreSQL + engine = create_engine( + f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@localhost:5432/" + ) + df = pd.read_sql_table("normalized_docs_txt", engine) # Assertions - assert vocab == expected_vocab + assert len(df) > 0 + assert "doc_id" in df.columns + assert "tokens" in df.columns + assert len(df["doc_id"]) == len(df) + + df["tokens"] = df["tokens"].apply(parse_pg_array) - if hasattr(dtm, "toarray"): - np.testing.assert_array_equal(dtm.toarray(), expected_dtm.toarray()) - else: - np.testing.assert_array_equal(dtm, expected_dtm) + assert isinstance(df.iloc[0]["tokens"], list) + assert all(isinstance(t, str) for t in df.iloc[0]["tokens"]) diff --git a/test/test_loaders.py b/test/test_loaders.py index 3c96468..c55827c 100644 --- a/test/test_loaders.py +++ b/test/test_loaders.py @@ -2,6 +2,7 @@ import tempfile from preprocessing.loader import TxtLoader, BibLoader +from preprocessing.models import DocumentRecord def test_txt_loader_reads_and_normalizes(): @@ -12,7 +13,16 @@ def test_txt_loader_reads_and_normalizes(): result = TxtLoader.load(fname) os.unlink(fname) - assert result == ["Hello World", "Second line"] + # Expect list of DocumentRecord + assert len(result) == 2 + + assert isinstance(result[0], DocumentRecord) + assert result[0].doc_id == "1" + assert result[0].text == "Hello World" + + assert isinstance(result[1], DocumentRecord) + assert result[1].doc_id == "2" + assert result[1].text == "Second line" def test_bib_loader_extracts_attribute(): @@ -30,4 +40,13 @@ def test_bib_loader_extracts_attribute(): result = BibLoader.load(fname, "abstract") os.unlink(fname) - assert result == ["This is Bib text."] + assert len(result) == 1 + + record = result[0] + assert isinstance(record, DocumentRecord) + + # ID taken from bib entry key: "@article{a,..." + assert record.doc_id == "a" + + # Normalized abstract text + assert record.text == "This is Bib text." 
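Note: the loader and preprocessor tests above and below import DocumentRecord from preprocessing/models.py, which is not included in this patch. Judging from the assertions (doc_id and text on input records, doc_id and tokens on the normalized output), the models are presumably small dataclasses along these lines; the NormalizedDocument name below is a guess, not taken from the repository:

# Rough sketch of the records the tests appear to assume.
# Field names come from the assertions in the diffs; the class/module
# layout and the NormalizedDocument name are assumptions.
from dataclasses import dataclass, field


@dataclass
class DocumentRecord:
    doc_id: str  # "1", "2" for txt line numbers; bib entry key otherwise
    text: str    # normalized source text


@dataclass
class NormalizedDocument:  # hypothetical name for the preprocessor output
    doc_id: str
    tokens: list[str] = field(default_factory=list)  # unigrams plus n-grams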
diff --git a/test/test_preprocessor_unit.py b/test/test_preprocessor_unit.py index 7829105..0833a14 100644 --- a/test/test_preprocessor_unit.py +++ b/test/test_preprocessor_unit.py @@ -1,26 +1,43 @@ -def test_preprocessor_tokenization(preprocessor, simple_texts): - preprocessor.texts = simple_texts - preprocessor.analyze_texts() - - assert len(preprocessor.token_frequency) > 0 - - -def test_preprocessor_bag_of_words(preprocessor, simple_texts): - preprocessor.texts = simple_texts - preprocessor.analyze_texts() - preprocessor.generate_bag_of_words() - - assert len(preprocessor.bag_of_words) == 2 - assert all(len(doc) > 0 for doc in preprocessor.bag_of_words) - - -def test_generate_document_term_matrix(preprocessor, simple_texts): - preprocessor.texts = simple_texts - preprocessor.analyze_texts() - preprocessor.generate_bag_of_words() - - dtm, vocab = preprocessor.generate_document_term_matrix() - - assert dtm.shape[0] == 2 - assert dtm.shape[1] == len(vocab) - assert dtm.sum() > 0 +from preprocessing.core import Preprocessor +from preprocessing.models import DocumentRecord + + +def test_preprocessor_generate_normalized_output(): + # Prepare input documents as dataclasses + docs = [ + DocumentRecord(doc_id="1", text="Dogs are running fast."), + DocumentRecord(doc_id="2", text="Cats jump high.") + ] + + pre = Preprocessor( + language="en", + filter_stopwords=True, + unigram_normalizer="lemma", + use_ngrams=True, + ngram_min=2, + ngram_max=2, + ) + + pre.documents = docs + output = pre.generate_normalized_output() + + # Basic structure checks + assert len(output) == 2 + assert output[0].doc_id == "1" + assert output[1].doc_id == "2" + + # Tokens must not be empty + assert len(output[0].tokens) > 0 + assert len(output[1].tokens) > 0 + + # Check that lemmatization worked + # "running" → "run" + assert "run" in output[0].tokens + + # Stopwords filtered → "are" removed + assert "are" not in output[0].tokens + + # Check n-gram generation (bigram because ngram_min=ngram_max=2) + # Example bigram from doc1: "dog run" (if spacy lemmatizes) + bigrams_doc1 = [tok for tok in output[0].tokens if " " in tok] + assert len(bigrams_doc1) > 0 # at least one n-gram produced From e36e7acd2c04a68cc7d7163d5ba7f09ee0afe52b Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 15 Dec 2025 13:25:55 -0600 Subject: [PATCH 9/9] chore: add postgres test container --- .github/workflows/ci.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 60e9630..3e28428 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -78,6 +78,19 @@ jobs: --health-interval 5s --health-retries 5 --health-timeout 5s + postgres: + image: postgres:15 + ports: + - 5432:5432 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + options: >- + --health-cmd="pg_isready -U postgres" + --health-interval=5s + --health-retries=10 + --health-timeout=5s steps: - uses: actions/checkout@v4
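For running the suite locally rather than in CI, the two service containers above boil down to a MinIO endpoint on 127.0.0.1:9000 (minioadmin/minioadmin) and a Postgres server on localhost:5432 (postgres/postgres). A minimal sanity-check sketch, assuming a boto3-style S3 client and SQLAlchemy as the tests use them; endpoints and credentials are copied from ci.yaml, and nothing below is part of the patched code:

# Connectivity check that mirrors the CI service containers.
import boto3
from sqlalchemy import create_engine, text

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)
print([b["Name"] for b in s3.list_buckets().get("Buckets", [])])  # MinIO reachable

engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@localhost:5432/"
)
with engine.connect() as conn:
    print(conn.execute(text("SELECT version()")).scalar())  # Postgres reachable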