From b5874cc0cbd62add13c36ac57ec7ccf18bd13dd5 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:26:58 -0600 Subject: [PATCH 1/9] chore: add tests --- .github/workflows/{docker.yaml => ci.yaml} | 50 ++++- input.bib | 112 +++++++++++ test/files/test.txt => input.txt | 0 main.py | 47 +---- preprocessing/__init__.py | 0 preprocessing/loader.py | 14 +- requirements.txt | 1 + test/__init__.py | 0 test/conftest.py | 20 ++ test/files/expected_dtm_from_bib.pkl | Bin 0 -> 7546 bytes test/files/expected_dtm_from_txt.pkl | Bin 0 -> 1081 bytes test/files/expected_vocab_from_bib.pkl | Bin 0 -> 9047 bytes test/files/expected_vocab_from_txt.pkl | Bin 0 -> 400 bytes test/files/input.txt | 4 + test/test_full.py | 213 +++++++++++++++++++++ test/test_loaders.py | 33 ++++ test/test_normalize.py | 17 ++ test/test_preprocessor_unit.py | 26 +++ 18 files changed, 487 insertions(+), 50 deletions(-) rename .github/workflows/{docker.yaml => ci.yaml} (52%) create mode 100644 input.bib rename test/files/test.txt => input.txt (100%) create mode 100644 preprocessing/__init__.py create mode 100644 test/__init__.py create mode 100644 test/conftest.py create mode 100644 test/files/expected_dtm_from_bib.pkl create mode 100644 test/files/expected_dtm_from_txt.pkl create mode 100644 test/files/expected_vocab_from_bib.pkl create mode 100644 test/files/expected_vocab_from_txt.pkl create mode 100644 test/files/input.txt create mode 100644 test/test_full.py create mode 100644 test/test_loaders.py create mode 100644 test/test_normalize.py create mode 100644 test/test_preprocessor_unit.py diff --git a/.github/workflows/docker.yaml b/.github/workflows/ci.yaml similarity index 52% rename from .github/workflows/docker.yaml rename to .github/workflows/ci.yaml index df0d4cf..3939887 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,5 @@ -name: Docker +name: CI + on: push: branches: @@ -10,9 +11,55 @@ env: IMAGE_NAME: ${{ github.repository }} jobs: + lint-python: + name: Lint Python + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + cache: "pip" + + - name: Run flake8 + uses: py-actions/flake8@v2 + + test: + runs-on: ubuntu-latest + needs: lint-python + services: + minio: + image: lazybit/minio + ports: + - 9000:9000 + env: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + options: >- + --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1" + --health-interval 5s + --health-retries 5 + --health-timeout 5s + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + cache: "pip" + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Run Tests + run: pytest -vv + build: name: Build docker image runs-on: ubuntu-latest + needs: test permissions: contents: read packages: write @@ -42,3 +89,4 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + diff --git a/input.bib b/input.bib new file mode 100644 index 0000000..f525305 --- /dev/null +++ b/input.bib @@ -0,0 +1,112 @@ + +@article{ WOS:001016714700004, +Author = {White, Joel}, +Title = {Theoretical and Practical Paralogisms of Digital Immortality}, +Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY}, +Year = {2022}, +Volume = {9}, +Number = {2, SI}, +Pages = {155-172}, +Month = {JUL 3}, +Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic + and popular relevance; specific 
naive metaphysical ideas, such as + immortality, have returned with this rise. This article refrains from + any ethical or political assessment of transhumanism. Still, it + critiques the exact metaphysical or idealistic nature of transhumanism + and its pursuit of digital immortality: the idea that, through + technological advancements, precisely in Artificial General + Intelligence, an immortal virtual ``self{''} will become possible. The + article follows the form of Immanuel Kant's ``Paralogisms{''} from the + Critique of Pure Reason, where Kant is concerned with the substantial, + immortal nature of the soul and its experiential impossibility. The + article will offer theoretical and practical paralogisms (false logical + inferences), arguing that the transhumanist claim that digital + immortality is possible fundamentally stems from two incorrect major + premises. The first concerns the substantial nature of information, + which informs the theoretical paralogisms; the second concerns infinite + transformation (pure plasticity), which informs the practical + paralogisms}, +Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD}, +Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND}, +Type = {Article}, +Language = {English}, +DOI = {10.1080/20539320.2022.2150463}, +ISSN = {2053-9320}, +EISSN = {2053-9339}, +Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms; + Digital Immortality}, +Research-Areas = {Philosophy}, +Web-of-Science-Categories = {Philosophy}, +Author-Email = {jhmw01@gmail.com}, +ORCID-Numbers = {White, Joel/0000-0001-6460-0564}, +Number-of-Cited-References = {30}, +Times-Cited = {0}, +Usage-Count-Last-180-days = {3}, +Usage-Count-Since-2013 = {15}, +Journal-ISO = {J. Aesthet. Phenomenol.}, +Doc-Delivery-Number = {K5GF0}, +Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)}, +Unique-ID = {WOS:001016714700004}, +DA = {2025-06-26}, +} + +@article{ WOS:001322577100012, +Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir}, +Title = {Sociopolitical Challenges to Digital Transformation of Rural + Communities: Learnings from a Case Study From Manipur, India}, +Journal = {IT PROFESSIONAL}, +Year = {2024}, +Volume = {26}, +Number = {4}, +Pages = {42-47}, +Month = {JUL-AUG}, +Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized + the inclusive growth of digital networks and digital public goods, + utilizing a multistakeholder systems approach. Similarly, the + information and communications technology (ICT) Innovation and + Intervention Program of the Government of India's Digital North East + Vision 2022 has also emphasized a need for inclusive growth of ICT in + the Northeast Region. In line with the above, this article presents + insights from a field study conducted in the rural parts of Manipur, + India, which incidentally can be found to be applicable to many rural + parts of the developing world. The article envisions a community-driven + sociodigital transformation of the Northeast Region of India. 
In this + quest, the article highlights sociopolitical challenges for digital + transformation and provides insights for inclusive ICT in such + regions-infrastructure as a utility for every citizen, smart governance + and services on demand, digital empowerment of citizens, social welfare, + capacity building, and community engagement.}, +Publisher = {IEEE COMPUTER SOC}, +Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA}, +Type = {Article}, +Language = {English}, +Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India. + Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India. + Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India. + Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.}, +DOI = {10.1109/MITP.2024.3433459}, +ISSN = {1520-9202}, +EISSN = {1941-045X}, +Keywords = {Technological innovation; Digital transformation; Government; Buildings; + Asia; Africa; Information and communication technology}, +Research-Areas = {Computer Science; Telecommunications}, +Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software + Engineering; Telecommunications}, +Author-Email = {vkant@iitk.ac.in + sanjrampk@iiti.ac.in + sudhir.dixit@ieee.org}, +Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of + Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT + System); Indian Institute of Technology (IIT) - Indore}, +ResearcherID-Numbers = {/ITU-6308-2023}, +ORCID-Numbers = {/0000-0002-6215-7500}, +Number-of-Cited-References = {7}, +Times-Cited = {0}, +Usage-Count-Last-180-days = {11}, +Usage-Count-Since-2013 = {22}, +Journal-ISO = {IT Prof.}, +Doc-Delivery-Number = {H3O9D}, +Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)}, +Unique-ID = {WOS:001322577100012}, +DA = {2025-06-26}, +} diff --git a/test/files/test.txt b/input.txt similarity index 100% rename from test/files/test.txt rename to input.txt diff --git a/main.py b/main.py index d48e02e..6698911 100644 --- a/main.py +++ b/main.py @@ -3,10 +3,10 @@ from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( - EnvSettings, - InputSettings, - OutputSettings, - FileSettings + EnvSettings, + InputSettings, + OutputSettings, + FileSettings ) from scystream.sdk.file_handling.s3_manager import S3Operations @@ -81,7 +81,7 @@ def _preprocess_and_store(texts, settings): dtm, vocab = pre.generate_document_term_matrix() with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ - tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: + tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: pickle.dump(dtm, tmp_dtm) tmp_dtm.flush() @@ -108,40 +108,3 @@ def preprocess_bib_file(settings): attribute=settings.bib_input.SELECTED_ATTRIBUTE, ) _preprocess_and_store(texts, settings) - - -""" -if __name__ == "__main__": - test = PreprocessBIB( - bib_input=BIBFileInput( - S3_HOST="http://localhost", - S3_PORT="9000", - S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="input-bucket", - FILE_PATH="input_file_path", - FILE_NAME="wos_export", - SELECTED_ATTRIBUTE="abstract" - ), - dtm_output=DTMFileOutput( - S3_HOST="http://localhost", - S3_PORT="9000", - S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="output-bucket", - FILE_PATH="output_file_path", - FILE_NAME="dtm_file_bib" - ), - vocab_output=VocabFileOutput( - S3_HOST="http://localhost", - S3_PORT="9000", - 
S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="output-bucket", - FILE_PATH="output_file_path", - FILE_NAME="vocab_file_bib" - ) - ) - - preprocess_bib_file(test) -""" diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/loader.py b/preprocessing/loader.py index d55aac3..9ff51c6 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -5,17 +5,17 @@ def normalize_text(text: str) -> str: if not text: return "" - # Remove curly braces - text = re.sub(r"[{}]", "", text) - # Remove LaTeX commands - text = re.sub(r"\\[a-zA-Z]+\s*(\{[^}]*\})?", "", text) + text = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", text) + + text = re.sub(r"\\[a-zA-Z]+", "", text) - # Remove LaTeX escaped quotes/accents - text = re.sub(r"\\""[a-zA-Z]", lambda m: m.group(0)[-1], text) + text = re.sub(r"[{}]", "", text) + + text = re.sub(r'\\"([a-zA-Z])', r'\1', text) text = re.sub(r"\\'", "", text) - text = text.replace("'", "") + text = re.sub(r"\s+", " ", text) return text.strip() diff --git a/requirements.txt b/requirements.txt index 3493ec1..e7737db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ spacy==3.8.7 nltk==3.9.1 numpy==2.3.3 bibtexparser==1.4.3 +pytest==9.0.1 diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..5ce5cca --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,20 @@ +import pytest + +from preprocessing.core import Preprocessor + + +@pytest.fixture +def simple_texts(): + return ["This is a test sentence.", "Another test sentence."] + + +@pytest.fixture +def preprocessor(): + return Preprocessor( + language="en", + filter_stopwords=True, + unigram_normalizer="porter", + use_ngrams=True, + ngram_min=2, + ngram_max=3, + ) diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl new file mode 100644 index 0000000000000000000000000000000000000000..812760358e55fa9a738dfd127f9a613172985d55 GIT binary patch literal 7546 zcmdU!zityj7{q;c3=K6E^i-}XsG!5b1uqaKjwK=`c66KzcQpKobX}I$;u&}o2=+}g z(oeg4-jjuk`!usN^L@K}y2fA5er{fl_P_S|dV71fo1HH=^=fvzy=ks5>iS|A+GpqW zYPnf|ZtCr_2@j#2?vRR|HN2}X!}stpyqZ_@@rPg4xA1AwPOqBX-Rhrf zeEmAy|2POHZS^KB7K`_P9)J7CeAI-?`?tSe9H!~>tU6O_jrE|M2^d}PvfOpo6ZSqs z<5F|z96fK`ZNPcjm+OQ{Pxbg zuAg>;I$sKxItQ93HJ|*%>wD$-q$l3}rEqS))c1$xqkhhlKefN-2~)1ty^>z{>iQU_ zyiz>%r~I>SfqcYMp7Z3-<>}tVtET4qoTYK@JU%zg^2Mj-(ms_#H8r1flzS?k z=FH`3emZB*cfZz&*LrHc^q%KfI@P}4OZh!N7c0kWZjSVQ_alF5^?4mr4$bHKSYB-1 z^9XxRU!3}$(j0UyG!NwvpPQ4b_rBzJ{z+$a-m&KkyKZQlde435(7sAnnkTo<&^f8+ zo2n}{Z|*q_rKkPS_k+%f=A#_%OMc?rFHCyku5Q(--T0Rh2{iweFSlG2Z)nu5c{pRO^~=sCw+aXs%;(<&S;ur0ZK_??>NU r`RII3#cQr(E6typOMd55^IR9ZSM_nATd~jH$Y|zNg&@g<&{Do4xZ1 zxs1H~Eyw$q`8l?;Z67&Keh++;Zt{_b4?+i_bHWGVW45KK2c6UQ+<)av_jCQ*Z2KO3 Xk30_@`qfQ_J{lFV62D%d;~#e|CV&Gzm#H#0rgGqZP~C<=<{q)~Cg_zvp*=xc#OxAX0nvF%Xplp4H$~Wjw>PLPY%{>GW0y?WB{5N@jU+Z#ZhUk___AVQnzd z#8UHETgfQxtO?7Tc`gK^X5MZOpvnaZXFgocTdjO|8E=u5WtPYw_$e^?B5$(rYPPc4 z!XMb#%G-y*sFydhyqmW&^l$TQje1F2EAAYmCF z0*&^1X%7lK6q~1Huq9=Ddmh~}$XZKfd$~1q?>@TJMn~4 zk)7erJR7wAJa^%B&DEsU!uT>ioTptQTFO>L$4BsD9Wo4>T|1J;n^`|QJB5ah;wei< zy*5NHcNyP{$DH}y8$78cMoUG8I8es-fjA*z?8(Y{U!Js`R-5hi%n<9d689ld_i>BHfV5q@>f z$fI7;V|mNoot;Dj({ako62ep&A1`X?2<>9U2~b^#*kJJxQ2=6QyPFqDf#n~nD|n!0 zoDbsz!jFT#sF#YaPn3`i(S-yg+e2-rynZgmPz8q(R+ zqj^3Agx!3%NbndQUrN`~R^Bb+Q+dkL(eAtu_k1jmS{r&CW)#nXgBj5u&vPN*74QU1 zq96Jtp2(YPNMt>tj8AiuIvaly&zMgQ5M@hgQpQgP^E#zPyLqpd?SOYZ#jVvz``~zI 
z8K3UBvWS!asXSq(&_r3rXYi(=9huCTICi>&9h!@FICcyGN6(>@CT}!jHOrWYvBV2Q z7}8$8mf`ah+hVCLds^ED(|De~QY0*t9Ilv&)FDYeCugEtyJJsPv9L)&X7(KDxSeJRTCjfjIWHJ$s@~23&gp+L5K(;RMO_NcvIDs5Gg+P zY#v?CQt_wrctWYD*B>ll^7DBvXhR7a&C2*WJmw0E=VFM~7`CFgfaim@#LT2KNZ}06 z14~+SJfEk82)Zuh*%}h$s(&Wl3F+_W$8CK<2feX{2aiWZ`_FvjSXM;BK2Qn-2$Lo1oVW0lX_zk?d4n?lKe!+k4HxrQ~&w_#b^t%#)YM2nQS^B8)x9(o6) zQqpFLcj_Q?X*&v}co$FFFD-NL=9v(HUI#F!_dp6Wp-(XH<;6M_dFw9Y_bK19pwr;{ zp;$A6L82n-Az^>OPrQ~D{ejelALKdvrG-#iw3f^GLn5CEG1o#zohFnhuKHn2VH=18 z>a*8D-Wuec_j*iXtBxCZgCUK1eguMRkD8}D34RoU7}Qzy$NYDdHJ9;?U^@KHdCpC| z9=P-Jn|Y&eMeXFw91-vqUKqmURM?gy)~!5h{qp13Q1MVV?I(l@-!{%v#-9{T2#N`< zaz4fLb;zmc(=xY);LnI2K;g_O=fuxqUwt#MOiciv<8gGF+V=CpgVJF}{(^`zghr+~ z`=aRH#NjWAS~c#Py1IupyMS-Qs>9m>8-KsU+xCX5Lg2fid%e2&o)`ou4fgih z;`^eB8YcD9N|7Lk{6O??kE}lwZj_E}3jL87VSu<)`eTvO;IIrWxql*zhmdiCpNeC- zwDU7z&NO^{{JAh!!%%Wr$Nh!ay1#w-rI?3pRRa76^3F zZ+RgEs)JD(|IU8v3iaOszwc5$)IDEqlQ>Em5PvW9@uN&a!kH+^HE_ z_P`$oOWMD|2koOHh!Z7^G1XjkF*hb(N2r_j!g~TU^+Bm(hTIS+BnMks+ndRyvcM`S8*lHHF$9tmAKg(Wc`xn zMSeMZ(B=kD2qiWmo=giOM+LY;3vFrexI6pNqG-)jQBKaS4W6$-SWS<1v`sXC+`U@T zb`eWRa}1;d8az|A-D!&sZ18%8^)N;UiG(srk6<>H(rp{OFpNS9sXHmclIB8>4hF0B zt30|Llv#t|S0=X8AsA>k^iXI(NsV-C0-B8Z4^LHznN#Uf{m$rE+{J>2Ro^j`>UIh?Y?xuh3@LUtOb;k?j|y~a6(2xq`M1irr{0v9>Q7;tCW0CQL>&W>0Yv; zknS)^_eM+!kyqv8%K@Wi0*}^lZ zGUR-!&VM0qgDgdS=plFS07Yw9iOVP>Kbw8@-K)WIh2(@C;prU-D^k{+)7 z>jxHd-h>ws@Sn>S3!Q96g05C2T~+!WhF=p+9w9?*9C)NSl+xyQhaLsL4ggc^&?#cL za{ES)7UKXFc}sf?;$?_non;NAr|R7J)ZWn1V|8v1n9~kDPL$w+;N!)9OvAbO6GR6g z2rj6d$9L9l55hxymenab|*9F%)D07nK~;TuDnC;gb!+`MNl?KK5i*1dP=6Y3NEx2 zyaUtFa~|c=g%h8by(_X+fPX-vUBZMuZ&88lqog=36x$O$N|&~)@Yt{8J{S(bn+v^b z@B>3vo3jmfz`EXP(^+6$D443*-?L$=0bpf)nhwB0m7WgW$&nQ@+4yyi zGNLyp^bG8?xkYhzCiG0`q6RthJ{Nhlm$fszEVix#tCTS@RZH-R95>Vb(gC$wTg%$7%6JvNLjxK`%xO z7OGZ{T`X2m2N6Z|qJb_EMc9mVsn~(i*7Qq}t^$IJ^fFmdF8$~wVs@agTkaNsE|+mN z%-#jiOJ%q8`jcKJY8O(>Ro`^zZAdPS8QIoELG&Kk+k~1;A7VK9Y z_EL>r2P+>&<#?q@`t=B;?f_4306)V=cX}g)83JMLrYd_=IKY(*Zr1k@J?7%(5s2}T_RmI zPb~$%8(yS+`sQEKdk`$Ec6G#fuUl2WbGY-N_lb^V1=DLH~pJ=W-!xUU}(5@ZZ|Ero*j$Exc(U&#ClbxS&vCw|cC*4oN8hutR#byIw}fO*q|v zpeK~Se4&p({t&TG*Yr_vr+ED=TGGd`iTmjA=hquyeC`f}Zo+!CYfHKr-X0=VR=5RD zWZ$dkR;bI+?yZVG4n}K`d3~Z!V3>U=r%z%zL%R_CDU1pc`+ouG(+Dl*<&r+r7##V3 D$S9&b literal 0 HcmV?d00001 diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0698bc3d6c3ccde57d064f0e7c0306221b561679 GIT binary patch literal 400 zcmX|-&2GXl5QL#I5R-;~N?SPi*b^_}r+@^AB_L7*QZJQy>1~eYec82R`?51D&u8!c ziN}Ap~2=~ zAgQXN+BFBO`xr&1E~cQ?00VB7HIY(`Tdj)v@M9VRNc7t9kWh?KbjRupL+?E1l`tWa zBXPJ9B-QZt7 0 + + +def test_preprocessor_bag_of_words(preprocessor, simple_texts): + preprocessor.texts = simple_texts + preprocessor.analyze_texts() + preprocessor.generate_bag_of_words() + + assert len(preprocessor.bag_of_words) == 2 + assert all(len(doc) > 0 for doc in preprocessor.bag_of_words) + + +def test_generate_document_term_matrix(preprocessor, simple_texts): + preprocessor.texts = simple_texts + preprocessor.analyze_texts() + preprocessor.generate_bag_of_words() + + dtm, vocab = preprocessor.generate_document_term_matrix() + + assert dtm.shape[0] == 2 + assert dtm.shape[1] == len(vocab) + assert dtm.sum() > 0 From db2ab5546e413a5007906a7aeadb624ec87a39eb Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:56:11 -0600 Subject: [PATCH 2/9] chore: test cbc validity --- .github/workflows/ci.yaml | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3939887..6a49819 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,9 +25,44 @@ jobs: - name: Run flake8 uses: py-actions/flake8@v2 - test: + validate-compute-block: runs-on: ubuntu-latest needs: lint-python + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + + - name: Intall dependencies + run: | + pip install -r requirements.txt + + - name: Check cbcs + run: | + python3 - <<'EOF' + import main + + from scystream.sdk.config import load_config, get_compute_block + from scystream.sdk.config.config_loader import _compare_configs + from pathlib import Path + + CBC_PATH = Path("cbc.yaml") + + if not CBC_PATH.exists(): + raise FileNotFoundError("cbc.yaml not found in repo root.") + + block_from_code = get_compute_block() + block_from_yaml = load_config(str(CBC_PATH)) + + _compare_configs(block_from_code, block_from_yaml) + + print("cbc.yaml matches python code definition") + EOF + + run-test: + runs-on: ubuntu-latest + needs: validate-compute-block services: minio: image: lazybit/minio @@ -59,7 +94,7 @@ jobs: build: name: Build docker image runs-on: ubuntu-latest - needs: test + needs: run-test permissions: contents: read packages: write From 182e71c04c232f556cc73dd13a71e30095cbb471 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:58:14 -0600 Subject: [PATCH 3/9] chore: rename workflow steps --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6a49819..786a9ed 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -26,6 +26,7 @@ jobs: uses: py-actions/flake8@v2 validate-compute-block: + name: Validate Compute Block Config runs-on: ubuntu-latest needs: lint-python steps: @@ -61,6 +62,7 @@ jobs: EOF run-test: + name: Run Tests runs-on: ubuntu-latest needs: validate-compute-block services: @@ -92,7 +94,7 @@ jobs: run: pytest -vv build: - name: Build docker image + name: Build Docker Image runs-on: ubuntu-latest needs: run-test permissions: From ab58dbb75b9aa93f2faf7f763aea0fdf24cdffee Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 3 Dec 2025 13:50:20 -0600 Subject: [PATCH 4/9] chore: add logging --- main.py | 24 +++++++++++++++++++++++- preprocessing/core.py | 21 ++++++++++++++++++--- preprocessing/loader.py | 5 +++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 6698911..64b9e21 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import pickle import tempfile +import logging from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( @@ -14,6 +15,13 @@ from preprocessing.loader import TxtLoader, BibLoader +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + class DTMFileOutput(FileSettings, OutputSettings): __identifier__ = "dtm_output" @@ -66,6 +74,8 @@ class PreprocessBIB(EnvSettings): def _preprocess_and_store(texts, settings): """Shared preprocessing logic for TXT and BIB.""" + logger.info(f"Starting preprocessing with {len(texts)} documents") + pre = Preprocessor( language=settings.LANGUAGE, filter_stopwords=settings.FILTER_STOPWORDS, @@ -74,10 +84,12 @@ def _preprocess_and_store(texts, settings): ngram_min=settings.NGRAM_MIN, ngram_max=settings.NGRAM_MAX, ) - pre.texts = texts + pre.texts = texts pre.analyze_texts() + pre.generate_bag_of_words() + dtm, 
vocab = pre.generate_document_term_matrix() with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ @@ -89,20 +101,30 @@ def _preprocess_and_store(texts, settings): pickle.dump(vocab, tmp_vocab) tmp_vocab.flush() + logger.info("Uploading DTM to S3...") S3Operations.upload(settings.dtm_output, tmp_dtm.name) + + logger.info("Uploading vocabulary to S3...") S3Operations.upload(settings.vocab_output, tmp_vocab.name) + logger.info("Preprocessing completed successfully.") + @entrypoint(PreprocessTXT) def preprocess_txt_file(settings): + logger.info("Downloading TXT input from S3...") S3Operations.download(settings.txt_input, "input.txt") + texts = TxtLoader.load("./input.txt") + _preprocess_and_store(texts, settings) @entrypoint(PreprocessBIB) def preprocess_bib_file(settings): + logger.info("Downloading BIB input from S3...") S3Operations.download(settings.bib_input, "input.bib") + texts = BibLoader.load( "./input.bib", attribute=settings.bib_input.SELECTED_ATTRIBUTE, diff --git a/preprocessing/core.py b/preprocessing/core.py index 4db4585..dba2a8d 100644 --- a/preprocessing/core.py +++ b/preprocessing/core.py @@ -1,3 +1,4 @@ +import logging import spacy import numpy as np @@ -9,6 +10,7 @@ "en": "en_core_web_sm", "de": "de_core_news_sm" } +logger = logging.getLogger(__name__) class Preprocessor: @@ -21,6 +23,12 @@ def __init__( ngram_min: int = 2, ngram_max: int = 3, ): + logger.info( + "Init Preprocessor (lang=%s, filter_stopwords=%s, ngrams=%s)", + language, + filter_stopwords, + use_ngrams, + ) self.language = language self.filter_stopwords = filter_stopwords self.unigram_normalizer = unigram_normalizer @@ -58,6 +66,7 @@ def filter_tokens( ] def analyze_texts(self): + logger.info(f"Analyzing {len(self.texts)} texts...") porter = PorterStemmer() for text in self.texts: doc = self.nlp(text) @@ -67,8 +76,8 @@ def analyze_texts(self): for sentence in doc.sents: filtered_tokens = self.filter_tokens( - list(sentence), - self.filter_stopwords + list(sentence), + self.filter_stopwords ) normalized_tokens = [ self.normalize_token(t, porter) for t in filtered_tokens @@ -93,6 +102,10 @@ def analyze_texts(self): if ngram_list: self.ngram_frequency.update(ngram_list) self.ngram_document_frequency.update(set(ngram_list)) + logger.info( + f"Finished analyzing texts: {self.token_frequency} unigrams, { + self.ngram_frequency} n-grams", + ) def normalize_token( self, @@ -110,6 +123,7 @@ def normalize_token( return word def generate_bag_of_words(self): + logger.info("Generating bag-of-words...") porter = PorterStemmer() self.bag_of_words = [] @@ -177,7 +191,7 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict): dtm (np.ndarray): shape = (num_docs, num_terms) vocab (dict): mapping term -> column index """ - + logger.info("Building document-term-matrix...") all_terms = set() for doc in self.bag_of_words: for t in doc: @@ -194,4 +208,5 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict): term_idx = vocab[token["term"]] dtm[doc_idx, term_idx] += 1 + logger.info(f"Matrix shape: {dtm.shape} | Vocab size: {len(vocab)}") return dtm, vocab diff --git a/preprocessing/loader.py b/preprocessing/loader.py index 9ff51c6..50d0177 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -1,6 +1,9 @@ +import logging import re import bibtexparser +logger = logging.getLogger(__name__) + def normalize_text(text: str) -> str: if not text: @@ -24,6 +27,7 @@ def normalize_text(text: str) -> str: class TxtLoader: @staticmethod def load(file_path: str) -> list[str]: + 
logger.info("Loading TXT file...") with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() return [normalize_text(line) for line in lines] @@ -32,6 +36,7 @@ def load(file_path: str) -> list[str]: class BibLoader: @staticmethod def load(file_path: str, attribute: str) -> list[str]: + logger.info(f"Loading BIB file (attribute={attribute})...") with open(file_path, "r", encoding="utf-8") as f: bib_database = bibtexparser.load(f) From 8868ef1cf9a6d82c61f384d3767b77062fe84f24 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:46:47 -0600 Subject: [PATCH 5/9] fix: use absolute path for file download --- .github/workflows/ci.yaml | 2 +- cbc.yaml | 12 ++-- input.bib | 112 -------------------------------------- input.txt | 4 -- main.py | 13 +++-- 5 files changed, 16 insertions(+), 127 deletions(-) delete mode 100644 input.bib delete mode 100644 input.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 786a9ed..60e9630 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -38,7 +38,7 @@ jobs: - name: Intall dependencies run: | pip install -r requirements.txt - + - name: Check cbcs run: | python3 - <<'EOF' diff --git a/cbc.yaml b/cbc.yaml index 0932961..a24e387 100644 --- a/cbc.yaml +++ b/cbc.yaml @@ -1,10 +1,11 @@ author: Paul Kalhorn description: Language preprocessing for .txt or .bib files -docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing +docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing entrypoints: preprocess_bib_file: - description: Entrypoint for preprocessing a .bib file + description: Entrypoint for preprocessing a .bib file envs: + BIB_DOWNLOAD_PATH: /tmp/input.bib FILTER_STOPWORDS: true LANGUAGE: en NGRAM_MAX: 3 @@ -23,7 +24,7 @@ entrypoints: bib_file_S3_PORT: null bib_file_S3_SECRET_KEY: null bib_file_SELECTED_ATTRIBUTE: Abstract - description: The bib file, aswell as one attribute selected for preprocessing + description: The bib file, aswell as one attribute selected for preprocessing type: file outputs: dtm_output: @@ -36,7 +37,7 @@ entrypoints: dtm_output_S3_HOST: null dtm_output_S3_PORT: null dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file + description: Numpy representation of document-term matrix as .pkl file type: file vocab_output: config: @@ -57,6 +58,7 @@ entrypoints: LANGUAGE: en NGRAM_MAX: 3 NGRAM_MIN: 2 + TXT_DOWNLOAD_PATH: /tmp/input.txt UNIGRAM_NORMALIZER: porter USE_NGRAMS: true inputs: @@ -70,7 +72,7 @@ entrypoints: txt_file_S3_HOST: null txt_file_S3_PORT: null txt_file_S3_SECRET_KEY: null - description: A .txt file + description: A .txt file type: file outputs: dtm_output: diff --git a/input.bib b/input.bib deleted file mode 100644 index f525305..0000000 --- a/input.bib +++ /dev/null @@ -1,112 +0,0 @@ - -@article{ WOS:001016714700004, -Author = {White, Joel}, -Title = {Theoretical and Practical Paralogisms of Digital Immortality}, -Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY}, -Year = {2022}, -Volume = {9}, -Number = {2, SI}, -Pages = {155-172}, -Month = {JUL 3}, -Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic - and popular relevance; specific naive metaphysical ideas, such as - immortality, have returned with this rise. This article refrains from - any ethical or political assessment of transhumanism. 
Still, it - critiques the exact metaphysical or idealistic nature of transhumanism - and its pursuit of digital immortality: the idea that, through - technological advancements, precisely in Artificial General - Intelligence, an immortal virtual ``self{''} will become possible. The - article follows the form of Immanuel Kant's ``Paralogisms{''} from the - Critique of Pure Reason, where Kant is concerned with the substantial, - immortal nature of the soul and its experiential impossibility. The - article will offer theoretical and practical paralogisms (false logical - inferences), arguing that the transhumanist claim that digital - immortality is possible fundamentally stems from two incorrect major - premises. The first concerns the substantial nature of information, - which informs the theoretical paralogisms; the second concerns infinite - transformation (pure plasticity), which informs the practical - paralogisms}, -Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD}, -Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND}, -Type = {Article}, -Language = {English}, -DOI = {10.1080/20539320.2022.2150463}, -ISSN = {2053-9320}, -EISSN = {2053-9339}, -Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms; - Digital Immortality}, -Research-Areas = {Philosophy}, -Web-of-Science-Categories = {Philosophy}, -Author-Email = {jhmw01@gmail.com}, -ORCID-Numbers = {White, Joel/0000-0001-6460-0564}, -Number-of-Cited-References = {30}, -Times-Cited = {0}, -Usage-Count-Last-180-days = {3}, -Usage-Count-Since-2013 = {15}, -Journal-ISO = {J. Aesthet. Phenomenol.}, -Doc-Delivery-Number = {K5GF0}, -Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)}, -Unique-ID = {WOS:001016714700004}, -DA = {2025-06-26}, -} - -@article{ WOS:001322577100012, -Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir}, -Title = {Sociopolitical Challenges to Digital Transformation of Rural - Communities: Learnings from a Case Study From Manipur, India}, -Journal = {IT PROFESSIONAL}, -Year = {2024}, -Volume = {26}, -Number = {4}, -Pages = {42-47}, -Month = {JUL-AUG}, -Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized - the inclusive growth of digital networks and digital public goods, - utilizing a multistakeholder systems approach. Similarly, the - information and communications technology (ICT) Innovation and - Intervention Program of the Government of India's Digital North East - Vision 2022 has also emphasized a need for inclusive growth of ICT in - the Northeast Region. In line with the above, this article presents - insights from a field study conducted in the rural parts of Manipur, - India, which incidentally can be found to be applicable to many rural - parts of the developing world. The article envisions a community-driven - sociodigital transformation of the Northeast Region of India. In this - quest, the article highlights sociopolitical challenges for digital - transformation and provides insights for inclusive ICT in such - regions-infrastructure as a utility for every citizen, smart governance - and services on demand, digital empowerment of citizens, social welfare, - capacity building, and community engagement.}, -Publisher = {IEEE COMPUTER SOC}, -Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA}, -Type = {Article}, -Language = {English}, -Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India. 
- Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India. - Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India. - Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.}, -DOI = {10.1109/MITP.2024.3433459}, -ISSN = {1520-9202}, -EISSN = {1941-045X}, -Keywords = {Technological innovation; Digital transformation; Government; Buildings; - Asia; Africa; Information and communication technology}, -Research-Areas = {Computer Science; Telecommunications}, -Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software - Engineering; Telecommunications}, -Author-Email = {vkant@iitk.ac.in - sanjrampk@iiti.ac.in - sudhir.dixit@ieee.org}, -Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of - Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT - System); Indian Institute of Technology (IIT) - Indore}, -ResearcherID-Numbers = {/ITU-6308-2023}, -ORCID-Numbers = {/0000-0002-6215-7500}, -Number-of-Cited-References = {7}, -Times-Cited = {0}, -Usage-Count-Last-180-days = {11}, -Usage-Count-Since-2013 = {22}, -Journal-ISO = {IT Prof.}, -Doc-Delivery-Number = {H3O9D}, -Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)}, -Unique-ID = {WOS:001322577100012}, -DA = {2025-06-26}, -} diff --git a/input.txt b/input.txt deleted file mode 100644 index 1755b0f..0000000 --- a/input.txt +++ /dev/null @@ -1,4 +0,0 @@ -Cats chase mice. Dogs chase cats. -Birds fly high. Cats and dogs coexist. -Mice hide from cats. Birds sing loudly. -Cats and dogs coexist. Cats and dogs coexist. diff --git a/main.py b/main.py index 64b9e21..aa03bf3 100644 --- a/main.py +++ b/main.py @@ -14,7 +14,6 @@ from preprocessing.core import Preprocessor from preprocessing.loader import TxtLoader, BibLoader - logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' @@ -54,6 +53,8 @@ class PreprocessTXT(EnvSettings): NGRAM_MIN: int = 2 NGRAM_MAX: int = 3 + TXT_DOWNLOAD_PATH: str = "/tmp/input.txt" + txt_input: TXTFileInput dtm_output: DTMFileOutput vocab_output: VocabFileOutput @@ -67,6 +68,8 @@ class PreprocessBIB(EnvSettings): NGRAM_MIN: int = 2 NGRAM_MAX: int = 3 + BIB_DOWNLOAD_PATH: str = "/tmp/input.bib" + bib_input: BIBFileInput dtm_output: DTMFileOutput vocab_output: VocabFileOutput @@ -113,9 +116,9 @@ def _preprocess_and_store(texts, settings): @entrypoint(PreprocessTXT) def preprocess_txt_file(settings): logger.info("Downloading TXT input from S3...") - S3Operations.download(settings.txt_input, "input.txt") + S3Operations.download(settings.txt_input, settings.TXT_DOWNLOAD_PATH) - texts = TxtLoader.load("./input.txt") + texts = TxtLoader.load(settings.TXT_DOWNLOAD_PATH) _preprocess_and_store(texts, settings) @@ -123,10 +126,10 @@ def preprocess_txt_file(settings): @entrypoint(PreprocessBIB) def preprocess_bib_file(settings): logger.info("Downloading BIB input from S3...") - S3Operations.download(settings.bib_input, "input.bib") + S3Operations.download(settings.bib_input, settings.BIB_DOWNLOAD_PATH) texts = BibLoader.load( - "./input.bib", + settings.BIB_DOWNLOAD_PATH, attribute=settings.bib_input.SELECTED_ATTRIBUTE, ) _preprocess_and_store(texts, settings) From d159e80d9ab57156661356b370fe7180c223a8a4 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:48:38 -0600 Subject: [PATCH 6/9] fix: use lemma in cbc --- cbc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cbc.yaml b/cbc.yaml index a24e387..d20bcdf 100644 --- a/cbc.yaml 
+++ b/cbc.yaml @@ -10,7 +10,7 @@ entrypoints: LANGUAGE: en NGRAM_MAX: 3 NGRAM_MIN: 2 - UNIGRAM_NORMALIZER: porter + UNIGRAM_NORMALIZER: lemma USE_NGRAMS: true inputs: bib_input: @@ -59,7 +59,7 @@ entrypoints: NGRAM_MAX: 3 NGRAM_MIN: 2 TXT_DOWNLOAD_PATH: /tmp/input.txt - UNIGRAM_NORMALIZER: porter + UNIGRAM_NORMALIZER: lemma USE_NGRAMS: true inputs: txt_input: From 7994335a7eca8eddea66b59a92a2e51e846d8a7b Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:57:58 -0600 Subject: [PATCH 7/9] fix: use porter in tests --- test/test_full.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_full.py b/test/test_full.py index c12be20..c97b370 100644 --- a/test/test_full.py +++ b/test/test_full.py @@ -57,6 +57,8 @@ def test_full_bib(s3_minio): ) env = { + "UNIGRAM_NORMALIZER": "porter", + "bib_file_S3_HOST": "http://127.0.0.1", "bib_file_S3_PORT": "9000", "bib_file_S3_ACCESS_KEY": MINIO_USER, @@ -140,6 +142,8 @@ def test_full_txt(s3_minio): ) env = { + "UNIGRAM_NORMALIZER": "porter", + "txt_file_S3_HOST": "http://127.0.0.1", "txt_file_S3_PORT": "9000", "txt_file_S3_ACCESS_KEY": MINIO_USER, From 2100e4568813a9d3e227a2cd119aa9b39e7e7c7a Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Thu, 11 Dec 2025 14:05:04 -0600 Subject: [PATCH 8/9] feat: remove bow and dtm logic --- cbc.yaml | 70 +++------- docker-compose.yml | 15 ++- main.py | 80 ++++++------ preprocessing/core.py | 162 ++++------------------- preprocessing/loader.py | 37 +++++- preprocessing/models.py | 14 ++ requirements.txt | 4 +- test/files/expected_dtm_from_bib.pkl | Bin 7546 -> 0 bytes test/files/expected_dtm_from_txt.pkl | Bin 1081 -> 0 bytes test/files/expected_vocab_from_bib.pkl | Bin 9047 -> 0 bytes test/files/expected_vocab_from_txt.pkl | Bin 400 -> 0 bytes test/test_full.py | 174 ++++++++++--------------- test/test_loaders.py | 23 +++- test/test_preprocessor_unit.py | 69 ++++++---- 14 files changed, 278 insertions(+), 370 deletions(-) create mode 100644 preprocessing/models.py delete mode 100644 test/files/expected_dtm_from_bib.pkl delete mode 100644 test/files/expected_dtm_from_txt.pkl delete mode 100644 test/files/expected_vocab_from_bib.pkl delete mode 100644 test/files/expected_vocab_from_txt.pkl diff --git a/cbc.yaml b/cbc.yaml index d20bcdf..4df19d8 100644 --- a/cbc.yaml +++ b/cbc.yaml @@ -1,9 +1,9 @@ -author: Paul Kalhorn +author: Paul Kalhorn description: Language preprocessing for .txt or .bib files -docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing +docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing entrypoints: preprocess_bib_file: - description: Entrypoint for preprocessing a .bib file + description: Entrypoint for preprocessing an attribute of a .bib file envs: BIB_DOWNLOAD_PATH: /tmp/input.bib FILTER_STOPWORDS: true @@ -27,30 +27,15 @@ entrypoints: description: The bib file, aswell as one attribute selected for preprocessing type: file outputs: - dtm_output: + normalized_docs_output: config: - dtm_output_BUCKET_NAME: null - dtm_output_FILE_EXT: pkl - dtm_output_FILE_NAME: null - dtm_output_FILE_PATH: null - dtm_output_S3_ACCESS_KEY: null - dtm_output_S3_HOST: null - dtm_output_S3_PORT: null - dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file - type: file - vocab_output: - config: - vocab_output_BUCKET_NAME: null - vocab_output_FILE_EXT: pkl - vocab_output_FILE_NAME: null - vocab_output_FILE_PATH: null - vocab_output_S3_ACCESS_KEY: null - vocab_output_S3_HOST: null - vocab_output_S3_PORT: 
null - vocab_output_S3_SECRET_KEY: null - description: Pkl file of a dictionary that maps all words to their index in the DTM - type: file + normalized_docs_DB_TABLE: null + normalized_docs_PG_HOST: null + normalized_docs_PG_PASS: null + normalized_docs_PG_PORT: null + normalized_docs_PG_USER: null + description: Database Output, containing bib_id aswell as the normalized text + type: pg_table preprocess_txt_file: description: Entrypoint to preprocess a .txt file envs: @@ -72,31 +57,16 @@ entrypoints: txt_file_S3_HOST: null txt_file_S3_PORT: null txt_file_S3_SECRET_KEY: null - description: A .txt file + description: A .txt file, each line will be treated as a document type: file outputs: - dtm_output: + normalized_docs_output: config: - dtm_output_BUCKET_NAME: null - dtm_output_FILE_EXT: pkl - dtm_output_FILE_NAME: null - dtm_output_FILE_PATH: null - dtm_output_S3_ACCESS_KEY: null - dtm_output_S3_HOST: null - dtm_output_S3_PORT: null - dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file - type: file - vocab_output: - config: - vocab_output_BUCKET_NAME: null - vocab_output_FILE_EXT: pkl - vocab_output_FILE_NAME: null - vocab_output_FILE_PATH: null - vocab_output_S3_ACCESS_KEY: null - vocab_output_S3_HOST: null - vocab_output_S3_PORT: null - vocab_output_S3_SECRET_KEY: null - description: Pkl file of a dictionary that maps all words to their index in the DTM - type: file + normalized_docs_DB_TABLE: null + normalized_docs_PG_HOST: null + normalized_docs_PG_PASS: null + normalized_docs_PG_PORT: null + normalized_docs_PG_USER: null + description: Database Output, containing bib_id aswell as the normalized text + type: pg_table name: Language-Preprocessing diff --git a/docker-compose.yml b/docker-compose.yml index 3c73fe4..d75aa02 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,13 +13,16 @@ services: ports: - "9000:9000" - "9001:9001" - networks: - - scystream-net -networks: - scystream-net: - driver: bridge + postgres: + image: postgres:13 + container_name: postgres + environment: + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=postgres + - POSTGRES_DB=postgres + ports: + - "5432:5432" volumes: minio_data: - search_query: diff --git a/main.py b/main.py index e599750..3223be6 100644 --- a/main.py +++ b/main.py @@ -1,18 +1,21 @@ -import pickle -import tempfile import logging +import pandas as pd +from sqlalchemy import create_engine +from typing import List from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( EnvSettings, InputSettings, OutputSettings, - FileSettings + FileSettings, + PostgresSettings ) from scystream.sdk.file_handling.s3_manager import S3Operations from preprocessing.core import Preprocessor from preprocessing.loader import TxtLoader, BibLoader +from preprocessing.models import DocumentRecord, PreprocessedDocument logging.basicConfig( level=logging.INFO, @@ -21,16 +24,8 @@ logger = logging.getLogger(__name__) -class DTMFileOutput(FileSettings, OutputSettings): - __identifier__ = "dtm_output" - - FILE_EXT: str = "pkl" - - -class VocabFileOutput(FileSettings, OutputSettings): - __identifier__ = "vocab_output" - - FILE_EXT: str = "pkl" +class NormalizedDocsOutput(PostgresSettings, OutputSettings): + __identifier__ = "normalized_docs" class TXTFileInput(FileSettings, InputSettings): @@ -56,8 +51,7 @@ class PreprocessTXT(EnvSettings): TXT_DOWNLOAD_PATH: str = "/tmp/input.txt" txt_input: TXTFileInput - dtm_output: DTMFileOutput - vocab_output: VocabFileOutput + normalized_docs_output: 
NormalizedDocsOutput class PreprocessBIB(EnvSettings): @@ -71,13 +65,37 @@ class PreprocessBIB(EnvSettings): BIB_DOWNLOAD_PATH: str = "/tmp/input.bib" bib_input: BIBFileInput - dtm_output: DTMFileOutput - vocab_output: VocabFileOutput + normalized_docs_output: NormalizedDocsOutput + + +def _write_preprocessed_docs_to_postgres( + preprocessed_ouput: List[PreprocessedDocument], + settings: PostgresSettings +): + df = pd.DataFrame([ + { + "doc_id": d.doc_id, + "tokens": d.tokens + } + for d in preprocessed_ouput + ]) + + logger.info(f"Writing {len(df)} processed documents to DB table '{ + settings.DB_TABLE}'…") + engine = create_engine( + f"postgresql+psycopg2://{settings.PG_USER}:{settings.PG_PASS}" + f"@{settings.PG_HOST}:{int(settings.PG_PORT)}/" + ) + + df.to_sql(settings.DB_TABLE, engine, if_exists="replace", index=False) + + logger.info(f"Successfully stored normalized documents into '{ + settings.DB_TABLE}'.") -def _preprocess_and_store(texts, settings): +def _preprocess_and_store(documents: List[DocumentRecord], settings): """Shared preprocessing logic for TXT and BIB.""" - logger.info(f"Starting preprocessing with {len(texts)} documents") + logger.info(f"Starting preprocessing with {len(documents)} documents") pre = Preprocessor( language=settings.LANGUAGE, @@ -88,27 +106,11 @@ def _preprocess_and_store(texts, settings): ngram_max=settings.NGRAM_MAX, ) - pre.texts = texts - pre.analyze_texts() - - pre.generate_bag_of_words() - - dtm, vocab = pre.generate_document_term_matrix() - - with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ - tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: - - pickle.dump(dtm, tmp_dtm) - tmp_dtm.flush() - - pickle.dump(vocab, tmp_vocab) - tmp_vocab.flush() - - logger.info("Uploading DTM to S3...") - S3Operations.upload(settings.dtm_output, tmp_dtm.name) + pre.documents = documents + result = pre.generate_normalized_output() - logger.info("Uploading vocabulary to S3...") - S3Operations.upload(settings.vocab_output, tmp_vocab.name) + _write_preprocessed_docs_to_postgres( + result, settings.normalized_docs_output) logger.info("Preprocessing completed successfully.") diff --git a/preprocessing/core.py b/preprocessing/core.py index dba2a8d..f819839 100644 --- a/preprocessing/core.py +++ b/preprocessing/core.py @@ -1,10 +1,9 @@ import logging import spacy -import numpy as np -from typing import Literal +from typing import Literal, List from nltk.stem.porter import PorterStemmer -from collections import Counter +from preprocessing.models import PreprocessedDocument, DocumentRecord LANG_TO_SPACY_MODELS = { "en": "en_core_web_sm", @@ -37,22 +36,14 @@ def __init__( self.ngram_max = ngram_max self.nlp_model = LANG_TO_SPACY_MODELS.get(language, "en_core_web_sm") - - self.ngram_frequency = Counter() - self.ngram_document_frequency = Counter() - self.token_frequency = Counter() - self.token_document_frequency = Counter() - - self.texts: list[str] = [] - - self.bag_of_words = [] - try: self.nlp = spacy.load(self.nlp_model, disable=["ner"]) except OSError: spacy.cli.download(self.nlp_model) self.nlp = spacy.load(self.nlp_model, disable=["ner"]) + self.documents: List[DocumentRecord] = [] + def filter_tokens( self, tokens: list[spacy.tokens.Token], @@ -65,47 +56,39 @@ def filter_tokens( and len(t.text) > 2 ] - def analyze_texts(self): - logger.info(f"Analyzing {len(self.texts)} texts...") + def generate_normalized_output(self) -> List[PreprocessedDocument]: + logger.info("Generating normalized output...") porter = PorterStemmer() - for text in 
self.texts: - doc = self.nlp(text) - token_list = [] - ngram_list = [] + processed_docs: List[PreprocessedDocument] = [] + + for record in self.documents: + doc = self.nlp(record.text) + doc_terms = [] - for sentence in doc.sents: - filtered_tokens = self.filter_tokens( - list(sentence), - self.filter_stopwords + # Process each sentence + for sent in doc.sents: + filtered = self.filter_tokens( + list(sent), self.filter_stopwords ) - normalized_tokens = [ - self.normalize_token(t, porter) for t in filtered_tokens + normalized = [ + self.normalize_token(t, porter) for t in filtered ] - token_list.extend(normalized_tokens) + doc_terms.extend(normalized) - if ( - self.use_ngrams and - self.ngram_min > 1 and - self.ngram_max > 1 - ): + # Generate n-grams + if self.use_ngrams and self.ngram_min > 1: for n in range(self.ngram_min, self.ngram_max + 1): - for i in range(len(normalized_tokens) - n + 1): - ngram = " ".join(normalized_tokens[i:i+n]) - ngram_list.append(ngram) + for i in range(len(normalized) - n + 1): + ngram = " ".join(normalized[i:i+n]) + doc_terms.append(ngram) - # update unigram counters - self.token_frequency.update(token_list) - self.token_document_frequency.update(set(token_list)) + processed_docs.append(PreprocessedDocument( + doc_id=record.doc_id, + tokens=doc_terms + )) - # update n-gram counters if any - if ngram_list: - self.ngram_frequency.update(ngram_list) - self.ngram_document_frequency.update(set(ngram_list)) - logger.info( - f"Finished analyzing texts: {self.token_frequency} unigrams, { - self.ngram_frequency} n-grams", - ) + return processed_docs def normalize_token( self, @@ -121,92 +104,3 @@ def normalize_token( elif self.unigram_normalizer == "lemma": return token.lemma_.lower() return word - - def generate_bag_of_words(self): - logger.info("Generating bag-of-words...") - porter = PorterStemmer() - self.bag_of_words = [] - - for text in self.texts: - doc = self.nlp(text) - doc_terms = [] - - for sent in doc.sents: - tokens = self.filter_tokens(list(sent), self.filter_stopwords) - - # Handle unigrams - for token in tokens: - normalized = self.normalize_token(token, porter) - - token_dict = { - "term": normalized, - "type": "word", - "span": 1, - "freq": self.token_frequency.get(normalized, 0), - "docs": ( - self.token_document_frequency.get(normalized, 0) - ), - "filters": ( - ["stop"] if not self.filter_stopwords - and token.is_stop else [] - ) - } - doc_terms.append(token_dict) - - # Handle ngrams - if self.use_ngrams and self.ngram_min > 1: - added_ngrams = set() # avoid duplicates - for n in range(self.ngram_min, self.ngram_max + 1): - for i in range(len(tokens) - n + 1): - ngram_tokens = tokens[i:i+n] - ngram_str = " ".join( - [self.normalize_token(t, porter) - for t in ngram_tokens] - ) - - if ngram_str in added_ngrams: - continue - added_ngrams.add(ngram_str) - - ngram_dict = { - "term": ngram_str, - "type": "ngram", - "span": n, - "freq": self.ngram_frequency.get(ngram_str, 0), - "docs": ( - self.ngram_document_frequency.get( - ngram_str, 0 - ) - ), - "filters": [] - } - doc_terms.append(ngram_dict) - - self.bag_of_words.append(doc_terms) - - def generate_document_term_matrix(self) -> (np.ndarray, dict): - """ - Converts bag_of_words into document-term matrix - - dtm (np.ndarray): shape = (num_docs, num_terms) - vocab (dict): mapping term -> column index - """ - logger.info("Building document-term-matrix...") - all_terms = set() - for doc in self.bag_of_words: - for t in doc: - all_terms.add(t["term"]) - - vocab = {term: idx for idx, term in 
enumerate(sorted(all_terms))} - - num_docs = len(self.bag_of_words) - num_terms = len(vocab) - dtm = np.zeros((num_docs, num_terms), dtype=int) - - for doc_idx, doc in enumerate(self.bag_of_words): - for token in doc: - term_idx = vocab[token["term"]] - dtm[doc_idx, term_idx] += 1 - - logger.info(f"Matrix shape: {dtm.shape} | Vocab size: {len(vocab)}") - return dtm, vocab diff --git a/preprocessing/loader.py b/preprocessing/loader.py index 50d0177..ecb5193 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -2,6 +2,8 @@ import re import bibtexparser +from preprocessing.models import DocumentRecord + logger = logging.getLogger(__name__) @@ -26,23 +28,46 @@ def normalize_text(text: str) -> str: class TxtLoader: @staticmethod - def load(file_path: str) -> list[str]: - logger.info("Loading TXT file...") + def load(file_path: str) -> list[DocumentRecord]: with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() - return [normalize_text(line) for line in lines] + + return [ + DocumentRecord( + doc_id=str(i), + text=normalize_text(line) + ) + for i, line in enumerate(lines, start=1) + ] class BibLoader: @staticmethod - def load(file_path: str, attribute: str) -> list[str]: + def load(file_path: str, attribute: str) -> list[DocumentRecord]: logger.info(f"Loading BIB file (attribute={attribute})...") + with open(file_path, "r", encoding="utf-8") as f: bib_database = bibtexparser.load(f) results = [] + attribute_lower = attribute.lower() + for entry in bib_database.entries: - value = entry.get(attribute.lower(), "") - results.append(normalize_text(value)) + bib_id = ( + entry.get("id") + or entry.get("ID") + or entry.get("citekey") + or entry.get("entrykey") + or entry.get("Unique-ID") + or "UNKNOWN_ID" + ) + + raw_value = entry.get(attribute_lower, "") + normalized = normalize_text(raw_value) + + results.append(DocumentRecord( + doc_id=bib_id, + text=normalized + )) return results diff --git a/preprocessing/models.py b/preprocessing/models.py new file mode 100644 index 0000000..48ddc4c --- /dev/null +++ b/preprocessing/models.py @@ -0,0 +1,14 @@ +from typing import List +from dataclasses import dataclass + + +@dataclass +class DocumentRecord: + doc_id: str # "0", "1", ... 
for TXT OR bib_id for BIB + text: str # normalized text + + +@dataclass +class PreprocessedDocument: + doc_id: str + tokens: List[str] diff --git a/requirements.txt b/requirements.txt index e7737db..1804e39 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ scystream-sdk==1.2.2 spacy==3.8.7 nltk==3.9.1 -numpy==2.3.3 bibtexparser==1.4.3 pytest==9.0.1 +pandas==2.3.3 +SQLAlchemy==2.0.43 +psycopg2-binary==2.9.10 diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl deleted file mode 100644 index 812760358e55fa9a738dfd127f9a613172985d55..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7546 zcmdU!zityj7{q;c3=K6E^i-}XsG!5b1uqaKjwK=`c66KzcQpKobX}I$;u&}o2=+}g z(oeg4-jjuk`!usN^L@K}y2fA5er{fl_P_S|dV71fo1HH=^=fvzy=ks5>iS|A+GpqW zYPnf|ZtCr_2@j#2?vRR|HN2}X!}stpyqZ_@@rPg4xA1AwPOqBX-Rhrf zeEmAy|2POHZS^KB7K`_P9)J7CeAI-?`?tSe9H!~>tU6O_jrE|M2^d}PvfOpo6ZSqs z<5F|z96fK`ZNPcjm+OQ{Pxbg zuAg>;I$sKxItQ93HJ|*%>wD$-q$l3}rEqS))c1$xqkhhlKefN-2~)1ty^>z{>iQU_ zyiz>%r~I>SfqcYMp7Z3-<>}tVtET4qoTYK@JU%zg^2Mj-(ms_#H8r1flzS?k z=FH`3emZB*cfZz&*LrHc^q%KfI@P}4OZh!N7c0kWZjSVQ_alF5^?4mr4$bHKSYB-1 z^9XxRU!3}$(j0UyG!NwvpPQ4b_rBzJ{z+$a-m&KkyKZQlde435(7sAnnkTo<&^f8+ zo2n}{Z|*q_rKkPS_k+%f=A#_%OMc?rFHCyku5Q(--T0Rh2{iweFSlG2Z)nu5c{pRO^~=sCw+aXs%;(<&S;ur0ZK_??>NU r`RII3#cQr(E6typOMd55^IR9ZSM_nATd~jH$Y|zNg&@g<&{Do4xZ1 zxs1H~Eyw$q`8l?;Z67&Keh++;Zt{_b4?+i_bHWGVW45KK2c6UQ+<)av_jCQ*Z2KO3 Xk30_@`qfQ_J{lFV62D%d;~#e|CV&Gzm#H#0rgGqZP~C<=<{q)~Cg_zvp*=xc#OxAX0nvF%Xplp4H$~Wjw>PLPY%{>GW0y?WB{5N@jU+Z#ZhUk___AVQnzd z#8UHETgfQxtO?7Tc`gK^X5MZOpvnaZXFgocTdjO|8E=u5WtPYw_$e^?B5$(rYPPc4 z!XMb#%G-y*sFydhyqmW&^l$TQje1F2EAAYmCF z0*&^1X%7lK6q~1Huq9=Ddmh~}$XZKfd$~1q?>@TJMn~4 zk)7erJR7wAJa^%B&DEsU!uT>ioTptQTFO>L$4BsD9Wo4>T|1J;n^`|QJB5ah;wei< zy*5NHcNyP{$DH}y8$78cMoUG8I8es-fjA*z?8(Y{U!Js`R-5hi%n<9d689ld_i>BHfV5q@>f z$fI7;V|mNoot;Dj({ako62ep&A1`X?2<>9U2~b^#*kJJxQ2=6QyPFqDf#n~nD|n!0 zoDbsz!jFT#sF#YaPn3`i(S-yg+e2-rynZgmPz8q(R+ zqj^3Agx!3%NbndQUrN`~R^Bb+Q+dkL(eAtu_k1jmS{r&CW)#nXgBj5u&vPN*74QU1 zq96Jtp2(YPNMt>tj8AiuIvaly&zMgQ5M@hgQpQgP^E#zPyLqpd?SOYZ#jVvz``~zI z8K3UBvWS!asXSq(&_r3rXYi(=9huCTICi>&9h!@FICcyGN6(>@CT}!jHOrWYvBV2Q z7}8$8mf`ah+hVCLds^ED(|De~QY0*t9Ilv&)FDYeCugEtyJJsPv9L)&X7(KDxSeJRTCjfjIWHJ$s@~23&gp+L5K(;RMO_NcvIDs5Gg+P zY#v?CQt_wrctWYD*B>ll^7DBvXhR7a&C2*WJmw0E=VFM~7`CFgfaim@#LT2KNZ}06 z14~+SJfEk82)Zuh*%}h$s(&Wl3F+_W$8CK<2feX{2aiWZ`_FvjSXM;BK2Qn-2$Lo1oVW0lX_zk?d4n?lKe!+k4HxrQ~&w_#b^t%#)YM2nQS^B8)x9(o6) zQqpFLcj_Q?X*&v}co$FFFD-NL=9v(HUI#F!_dp6Wp-(XH<;6M_dFw9Y_bK19pwr;{ zp;$A6L82n-Az^>OPrQ~D{ejelALKdvrG-#iw3f^GLn5CEG1o#zohFnhuKHn2VH=18 z>a*8D-Wuec_j*iXtBxCZgCUK1eguMRkD8}D34RoU7}Qzy$NYDdHJ9;?U^@KHdCpC| z9=P-Jn|Y&eMeXFw91-vqUKqmURM?gy)~!5h{qp13Q1MVV?I(l@-!{%v#-9{T2#N`< zaz4fLb;zmc(=xY);LnI2K;g_O=fuxqUwt#MOiciv<8gGF+V=CpgVJF}{(^`zghr+~ z`=aRH#NjWAS~c#Py1IupyMS-Qs>9m>8-KsU+xCX5Lg2fid%e2&o)`ou4fgih z;`^eB8YcD9N|7Lk{6O??kE}lwZj_E}3jL87VSu<)`eTvO;IIrWxql*zhmdiCpNeC- zwDU7z&NO^{{JAh!!%%Wr$Nh!ay1#w-rI?3pRRa76^3F zZ+RgEs)JD(|IU8v3iaOszwc5$)IDEqlQ>Em5PvW9@uN&a!kH+^HE_ z_P`$oOWMD|2koOHh!Z7^G1XjkF*hb(N2r_j!g~TU^+Bm(hTIS+BnMks+ndRyvcM`S8*lHHF$9tmAKg(Wc`xn zMSeMZ(B=kD2qiWmo=giOM+LY;3vFrexI6pNqG-)jQBKaS4W6$-SWS<1v`sXC+`U@T zb`eWRa}1;d8az|A-D!&sZ18%8^)N;UiG(srk6<>H(rp{OFpNS9sXHmclIB8>4hF0B zt30|Llv#t|S0=X8AsA>k^iXI(NsV-C0-B8Z4^LHznN#Uf{m$rE+{J>2Ro^j`>UIh?Y?xuh3@LUtOb;k?j|y~a6(2xq`M1irr{0v9>Q7;tCW0CQL>&W>0Yv; zknS)^_eM+!kyqv8%K@Wi0*}^lZ zGUR-!&VM0qgDgdS=plFS07Yw9iOVP>Kbw8@-K)WIh2(@C;prU-D^k{+)7 z>jxHd-h>ws@Sn>S3!Q96g05C2T~+!WhF=p+9w9?*9C)NSl+xyQhaLsL4ggc^&?#cL 
za{ES)7UKXFc}sf?;$?_non;NAr|R7J)ZWn1V|8v1n9~kDPL$w+;N!)9OvAbO6GR6g z2rj6d$9L9l55hxymenab|*9F%)D07nK~;TuDnC;gb!+`MNl?KK5i*1dP=6Y3NEx2 zyaUtFa~|c=g%h8by(_X+fPX-vUBZMuZ&88lqog=36x$O$N|&~)@Yt{8J{S(bn+v^b z@B>3vo3jmfz`EXP(^+6$D443*-?L$=0bpf)nhwB0m7WgW$&nQ@+4yyi zGNLyp^bG8?xkYhzCiG0`q6RthJ{Nhlm$fszEVix#tCTS@RZH-R95>Vb(gC$wTg%$7%6JvNLjxK`%xO z7OGZ{T`X2m2N6Z|qJb_EMc9mVsn~(i*7Qq}t^$IJ^fFmdF8$~wVs@agTkaNsE|+mN z%-#jiOJ%q8`jcKJY8O(>Ro`^zZAdPS8QIoELG&Kk+k~1;A7VK9Y z_EL>r2P+>&<#?q@`t=B;?f_4306)V=cX}g)83JMLrYd_=IKY(*Zr1k@J?7%(5s2}T_RmI zPb~$%8(yS+`sQEKdk`$Ec6G#fuUl2WbGY-N_lb^V1=DLH~pJ=W-!xUU}(5@ZZ|Ero*j$Exc(U&#ClbxS&vCw|cC*4oN8hutR#byIw}fO*q|v zpeK~Se4&p({t&TG*Yr_vr+ED=TGGd`iTmjA=hquyeC`f}Zo+!CYfHKr-X0=VR=5RD zWZ$dkR;bI+?yZVG4n}K`d3~Z!V3>U=r%z%zL%R_CDU1pc`+ouG(+Dl*<&r+r7##V3 D$S9&b diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl deleted file mode 100644 index 0698bc3d6c3ccde57d064f0e7c0306221b561679..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 400 zcmX|-&2GXl5QL#I5R-;~N?SPi*b^_}r+@^AB_L7*QZJQy>1~eYec82R`?51D&u8!c ziN}Ap~2=~ zAgQXN+BFBO`xr&1E~cQ?00VB7HIY(`Tdj)v@M9VRNc7t9kWh?KbjRupL+?E1l`tWa zBXPJ9B-QZt7 list[str]: + # Convert Postgres literal → Python list + arr = arr.strip("{}") + if not arr: + return [] + # handle quoted items + return [a.strip('"') for a in arr.split(",")] + def ensure_bucket(s3, bucket): try: @@ -44,21 +56,23 @@ def s3_minio(): def test_full_bib(s3_minio): input_file_name = "input" - dtm_output_file_name = "dtm_file" - vocab_output_file_name = "vocab_file" bib_path = Path(__file__).parent / "files" / f"{input_file_name}.bib" bib_bytes = bib_path.read_bytes() + # Upload to MinIO s3_minio.put_object( Bucket=BUCKET_NAME, Key=f"{input_file_name}.bib", Body=bib_bytes ) + # ENV for preprocess_bib_file env = { + # Preprocessor config "UNIGRAM_NORMALIZER": "porter", + # BIB INPUT S3 "bib_file_S3_HOST": "http://127.0.0.1", "bib_file_S3_PORT": "9000", "bib_file_S3_ACCESS_KEY": MINIO_USER, @@ -68,73 +82,54 @@ def test_full_bib(s3_minio): "bib_file_FILE_NAME": input_file_name, "bib_file_SELECTED_ATTRIBUTE": "abstract", - "dtm_output_S3_HOST": "http://127.0.0.1", - "dtm_output_S3_PORT": "9000", - "dtm_output_S3_ACCESS_KEY": MINIO_USER, - "dtm_output_S3_SECRET_KEY": MINIO_PWD, - "dtm_output_BUCKET_NAME": BUCKET_NAME, - "dtm_output_FILE_PATH": "", - "dtm_output_FILE_NAME": dtm_output_file_name, - - "vocab_output_S3_HOST": "http://127.0.0.1", - "vocab_output_S3_PORT": "9000", - "vocab_output_S3_ACCESS_KEY": MINIO_USER, - "vocab_output_S3_SECRET_KEY": MINIO_PWD, - "vocab_output_BUCKET_NAME": BUCKET_NAME, - "vocab_output_FILE_PATH": "", - "vocab_output_FILE_NAME": vocab_output_file_name, + # PostgreSQL output + "normalized_docs_PG_HOST": "localhost", + "normalized_docs_PG_PORT": "5432", + "normalized_docs_PG_USER": PG_USER, + "normalized_docs_PG_PASS": PG_PASS, + "normalized_docs_DB_TABLE": "normalized_docs_bib", } for k, v in env.items(): os.environ[k] = v + # Run block preprocess_bib_file() - keys = [ - o["Key"] - for o in s3_minio.list_objects_v2( - Bucket="testbucket").get("Contents", []) - ] - - assert f"{dtm_output_file_name}.pkl" in keys - assert f"{vocab_output_file_name}.pkl" in keys - - dtm_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - dtm_output_file_name}.pkl") - vocab_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - vocab_output_file_name}.pkl") - - # Load produced results - with open(dtm_path, "rb") as f: - dtm = pickle.load(f) - - with open(vocab_path, "rb") as f: - vocab = pickle.load(f) - 
- # Load expected snapshot files - expected_vocab_path = Path(__file__).parent / \ - "files" / "expected_vocab_from_bib.pkl" - expected_dtm_path = Path(__file__).parent / "files" / \ - "expected_dtm_from_bib.pkl" + # Query PostgreSQL for inserted documents + engine = create_engine( + f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@localhost:5432/" + ) + df = pd.read_sql_table("normalized_docs_bib", engine) - with open(expected_vocab_path, "rb") as f: - expected_vocab = pickle.load(f) + # Assertions + assert len(df) > 0 + assert "doc_id" in df.columns + assert "tokens" in df.columns + + # doc_id increments + assert len(df["doc_id"]) == len(df) # doc_id count matches rows + assert df["doc_id"].is_unique # no duplicates + assert all(isinstance(x, str) for x in df["doc_id"]) # Bib IDs are strings + + assert set(df["doc_id"]) == { + "WOS:001016714700004", + "WOS:001322577100012" + } - with open(expected_dtm_path, "rb") as f: - expected_dtm = pickle.load(f) + df["tokens"] = df["tokens"].apply(parse_pg_array) - assert vocab == expected_vocab - np.testing.assert_array_equal(dtm, expected_dtm) + assert isinstance(df.iloc[0]["tokens"], list) + assert all(isinstance(t, str) for t in df.iloc[0]["tokens"]) def test_full_txt(s3_minio): input_file_name = "input" - dtm_output_file_name = "dtm_txt_file" - vocab_output_file_name = "vocab_txt_file" txt_path = Path(__file__).parent / "files" / f"{input_file_name}.txt" txt_bytes = txt_path.read_bytes() + # Upload input to MinIO s3_minio.put_object( Bucket=BUCKET_NAME, Key=f"{input_file_name}.txt", @@ -144,6 +139,7 @@ def test_full_txt(s3_minio): env = { "UNIGRAM_NORMALIZER": "porter", + # TXT input S3 "txt_file_S3_HOST": "http://127.0.0.1", "txt_file_S3_PORT": "9000", "txt_file_S3_ACCESS_KEY": MINIO_USER, @@ -152,21 +148,12 @@ def test_full_txt(s3_minio): "txt_file_FILE_PATH": "", "txt_file_FILE_NAME": input_file_name, - "dtm_output_S3_HOST": "http://127.0.0.1", - "dtm_output_S3_PORT": "9000", - "dtm_output_S3_ACCESS_KEY": MINIO_USER, - "dtm_output_S3_SECRET_KEY": MINIO_PWD, - "dtm_output_BUCKET_NAME": BUCKET_NAME, - "dtm_output_FILE_PATH": "", - "dtm_output_FILE_NAME": dtm_output_file_name, - - "vocab_output_S3_HOST": "http://127.0.0.1", - "vocab_output_S3_PORT": "9000", - "vocab_output_S3_ACCESS_KEY": MINIO_USER, - "vocab_output_S3_SECRET_KEY": MINIO_PWD, - "vocab_output_BUCKET_NAME": BUCKET_NAME, - "vocab_output_FILE_PATH": "", - "vocab_output_FILE_NAME": vocab_output_file_name, + # Postgres output + "normalized_docs_PG_HOST": "localhost", + "normalized_docs_PG_PORT": "5432", + "normalized_docs_PG_USER": PG_USER, + "normalized_docs_PG_PASS": PG_PASS, + "normalized_docs_DB_TABLE": "normalized_docs_txt", } for k, v in env.items(): @@ -174,44 +161,19 @@ def test_full_txt(s3_minio): preprocess_txt_file() - keys = [ - o["Key"] - for o in s3_minio.list_objects_v2( - Bucket=BUCKET_NAME).get("Contents", []) - ] - - assert f"{dtm_output_file_name}.pkl" in keys - assert f"{vocab_output_file_name}.pkl" in keys - - # Download produced files - dtm_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - dtm_output_file_name}.pkl") - vocab_path = download_to_tmp(s3_minio, BUCKET_NAME, f"{ - vocab_output_file_name}.pkl") - - # Load produced results - with open(dtm_path, "rb") as f: - dtm = pickle.load(f) - - with open(vocab_path, "rb") as f: - vocab = pickle.load(f) - - # Load expected snapshot files - expected_vocab_path = Path(__file__).parent / \ - "files" / "expected_vocab_from_txt.pkl" - expected_dtm_path = Path(__file__).parent / \ - "files" / "expected_dtm_from_txt.pkl" - - 
with open(expected_vocab_path, "rb") as f: - expected_vocab = pickle.load(f) - - with open(expected_dtm_path, "rb") as f: - expected_dtm = pickle.load(f) + # Query PostgreSQL + engine = create_engine( + f"postgresql+psycopg2://{PG_USER}:{PG_PASS}@localhost:5432/" + ) + df = pd.read_sql_table("normalized_docs_txt", engine) # Assertions - assert vocab == expected_vocab + assert len(df) > 0 + assert "doc_id" in df.columns + assert "tokens" in df.columns + assert len(df["doc_id"]) == len(df) + + df["tokens"] = df["tokens"].apply(parse_pg_array) - if hasattr(dtm, "toarray"): - np.testing.assert_array_equal(dtm.toarray(), expected_dtm.toarray()) - else: - np.testing.assert_array_equal(dtm, expected_dtm) + assert isinstance(df.iloc[0]["tokens"], list) + assert all(isinstance(t, str) for t in df.iloc[0]["tokens"]) diff --git a/test/test_loaders.py b/test/test_loaders.py index 3c96468..c55827c 100644 --- a/test/test_loaders.py +++ b/test/test_loaders.py @@ -2,6 +2,7 @@ import tempfile from preprocessing.loader import TxtLoader, BibLoader +from preprocessing.models import DocumentRecord def test_txt_loader_reads_and_normalizes(): @@ -12,7 +13,16 @@ def test_txt_loader_reads_and_normalizes(): result = TxtLoader.load(fname) os.unlink(fname) - assert result == ["Hello World", "Second line"] + # Expect list of DocumentRecord + assert len(result) == 2 + + assert isinstance(result[0], DocumentRecord) + assert result[0].doc_id == "1" + assert result[0].text == "Hello World" + + assert isinstance(result[1], DocumentRecord) + assert result[1].doc_id == "2" + assert result[1].text == "Second line" def test_bib_loader_extracts_attribute(): @@ -30,4 +40,13 @@ def test_bib_loader_extracts_attribute(): result = BibLoader.load(fname, "abstract") os.unlink(fname) - assert result == ["This is Bib text."] + assert len(result) == 1 + + record = result[0] + assert isinstance(record, DocumentRecord) + + # ID taken from bib entry key: "@article{a,..." + assert record.doc_id == "a" + + # Normalized abstract text + assert record.text == "This is Bib text." 
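Note: the loader and preprocessor tests above and below import DocumentRecord from preprocessing/models.py, which is not included in this patch. Judging from the assertions (doc_id and text on input records, doc_id and tokens on the normalized output), the models are presumably small dataclasses along these lines; the NormalizedDocument name below is a guess, not taken from the repository:

# Rough sketch of the records the tests appear to assume.
# Field names come from the assertions in the diffs; the class/module
# layout and the NormalizedDocument name are assumptions.
from dataclasses import dataclass, field


@dataclass
class DocumentRecord:
    doc_id: str  # "1", "2" for txt line numbers; bib entry key otherwise
    text: str    # normalized source text


@dataclass
class NormalizedDocument:  # hypothetical name for the preprocessor output
    doc_id: str
    tokens: list[str] = field(default_factory=list)  # unigrams plus n-grams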
diff --git a/test/test_preprocessor_unit.py b/test/test_preprocessor_unit.py index 7829105..0833a14 100644 --- a/test/test_preprocessor_unit.py +++ b/test/test_preprocessor_unit.py @@ -1,26 +1,43 @@ -def test_preprocessor_tokenization(preprocessor, simple_texts): - preprocessor.texts = simple_texts - preprocessor.analyze_texts() - - assert len(preprocessor.token_frequency) > 0 - - -def test_preprocessor_bag_of_words(preprocessor, simple_texts): - preprocessor.texts = simple_texts - preprocessor.analyze_texts() - preprocessor.generate_bag_of_words() - - assert len(preprocessor.bag_of_words) == 2 - assert all(len(doc) > 0 for doc in preprocessor.bag_of_words) - - -def test_generate_document_term_matrix(preprocessor, simple_texts): - preprocessor.texts = simple_texts - preprocessor.analyze_texts() - preprocessor.generate_bag_of_words() - - dtm, vocab = preprocessor.generate_document_term_matrix() - - assert dtm.shape[0] == 2 - assert dtm.shape[1] == len(vocab) - assert dtm.sum() > 0 +from preprocessing.core import Preprocessor +from preprocessing.models import DocumentRecord + + +def test_preprocessor_generate_normalized_output(): + # Prepare input documents as dataclasses + docs = [ + DocumentRecord(doc_id="1", text="Dogs are running fast."), + DocumentRecord(doc_id="2", text="Cats jump high.") + ] + + pre = Preprocessor( + language="en", + filter_stopwords=True, + unigram_normalizer="lemma", + use_ngrams=True, + ngram_min=2, + ngram_max=2, + ) + + pre.documents = docs + output = pre.generate_normalized_output() + + # Basic structure checks + assert len(output) == 2 + assert output[0].doc_id == "1" + assert output[1].doc_id == "2" + + # Tokens must not be empty + assert len(output[0].tokens) > 0 + assert len(output[1].tokens) > 0 + + # Check that lemmatization worked + # "running" → "run" + assert "run" in output[0].tokens + + # Stopwords filtered → "are" removed + assert "are" not in output[0].tokens + + # Check n-gram generation (bigram because ngram_min=ngram_max=2) + # Example bigram from doc1: "dog run" (if spacy lemmatizes) + bigrams_doc1 = [tok for tok in output[0].tokens if " " in tok] + assert len(bigrams_doc1) > 0 # at least one n-gram produced From e36e7acd2c04a68cc7d7163d5ba7f09ee0afe52b Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 15 Dec 2025 13:25:55 -0600 Subject: [PATCH 9/9] chore: add postgres test container --- .github/workflows/ci.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 60e9630..3e28428 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -78,6 +78,19 @@ jobs: --health-interval 5s --health-retries 5 --health-timeout 5s + postgres: + image: postgres:15 + ports: + - 5432:5432 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + options: >- + --health-cmd="pg_isready -U postgres" + --health-interval=5s + --health-retries=10 + --health-timeout=5s steps: - uses: actions/checkout@v4
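For running the suite locally rather than in CI, the two service containers above boil down to a MinIO endpoint on 127.0.0.1:9000 (minioadmin/minioadmin) and a Postgres server on localhost:5432 (postgres/postgres). A minimal sanity-check sketch, assuming a boto3-style S3 client and SQLAlchemy as the tests use them; endpoints and credentials are copied from ci.yaml, and nothing below is part of the patched code:

# Connectivity check that mirrors the CI service containers.
import boto3
from sqlalchemy import create_engine, text

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)
print([b["Name"] for b in s3.list_buckets().get("Buckets", [])])  # MinIO reachable

engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@localhost:5432/"
)
with engine.connect() as conn:
    print(conn.execute(text("SELECT version()")).scalar())  # Postgres reachable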