diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f436ae..cf66d5c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,30 +1,36 @@ fail_fast: false repos: + # Python linting and formatting - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.1 + rev: v0.14.14 hooks: - - id: ruff + - id: ruff-check args: [--fix, --ignore, E722] - id: ruff-format + + # YAML formatting - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.14.0 + rev: v2.16.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '4', --preserve-quotes] - files: ^(?!.*docker-compose.yml)$ -#- repo: https://github.com/pre-commit/pre-commit-hooks -# rev: v4.6.0 -# hooks: -# - id: pretty-format-json -# args: [--autofix, --indent, '4', --no-sort-keys] + + # JSON formatting +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: pretty-format-json + args: [--autofix, --indent, '4', --no-sort-keys] + + # TOML formatting and sorting - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.14.0 + rev: v2.16.0 hooks: - id: pretty-format-toml args: [--autofix] - repo: https://github.com/pappasam/toml-sort - rev: v0.24.2 + rev: v0.24.3 hooks: - id: toml-sort args: [-ia] diff --git a/README.md b/README.md index 569ab90..c70fd17 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ -# PELinker +# PELinker (Property Entity LINKER) + +Entity linking for BERT-like models -A service for entity linking of properties ## Developer notes diff --git a/data/derived/100_freq_embeddings.pkl b/data/derived/100_freq_embeddings.pkl deleted file mode 100644 index 242f4d9..0000000 Binary files a/data/derived/100_freq_embeddings.pkl and /dev/null differ diff --git a/data/derived/491_within_n_across_disps.csv b/data/derived/491_within_n_across_disps.csv deleted file mode 100644 index aade892..0000000 --- a/data/derived/491_within_n_across_disps.csv +++ /dev/null @@ -1,492 +0,0 @@ 
-within_verbs,across_verbs -0.605480839972912,0.725044923255003 -0.49736222140377945,0.631654891899663 -0.47168608927506156,0.6138123822114263 -0.4742792113299598,0.7023129564307181 -0.4169654134273455,0.7616582546135822 -0.5073276350776147,0.7010983548259359 -0.5080952583325881,0.6184856455982031 -0.6336920196694058,0.7135379130646389 -0.5096374318767398,0.6247402327348868 -0.44965059748390324,0.6823137186224669 -0.5177768097930858,0.7348989092314762 -0.5264673201782423,0.5656085507436923 -0.45703008446181675,0.6753671791010405 -0.48451652957508223,0.675969522638134 -0.5793227974132165,0.7934024646556234 -0.4913358732252781,0.6095717139270582 -0.45817187199836795,0.64373781232785 -0.4859303505313909,0.6564623362610232 -0.5217150473881683,0.6633335815310539 -0.45370235145655574,0.7058189496751563 -0.520530325532616,0.6495729620128191 -0.4853643349500969,0.7018000450919247 -0.480950212116653,0.5894266788293532 -0.44550003460714416,0.62064761324277 -0.607334509814434,0.6488159916632716 -0.4877989698269567,0.5836701510547643 -0.4826012084755831,0.6052885169187038 -0.491130469858808,0.738665553634054 -0.5283830946696207,0.7423043754047103 -0.6058576848053016,0.6718158813486695 -0.49611379015129337,0.6865991262904046 -0.45995816364604886,0.74150820599475 -0.4693663152080771,0.5758253147021858 -0.465618984115237,0.6288827659106568 -0.4275132711943728,0.6064278036275366 -0.522436400030277,0.8633923120617873 -0.4874375235782103,0.6406178862813954 -0.4750879879671092,0.7103825562514424 -0.43435649184083336,0.7567480449667344 -0.47588486149035236,0.6669337847163456 -0.5122148682566616,0.7088152200286199 -0.5278252347390088,0.7557996255172601 -0.5121233945081163,0.7106874805636786 -0.45985626317125283,0.7479672908574394 -0.49211643399141797,0.6178351055158131 -0.4663617291798036,0.690202691786898 -0.4726400200652983,0.6449681048260761 -0.4794069529899159,0.6749247992817111 -0.32895459201304594,0.8163349355607313 -0.4669423497568996,0.7065692348167673 
-0.556727180680389,0.6460490110795182 -0.5118774918835025,0.6106099825723187 -0.45413602944260634,0.6372996361435752 -0.42567105798092547,0.7766913773120602 -0.5150247407300836,0.793609809691233 -0.5398529540916295,0.7383232563468183 -0.5028750299429781,0.6800528762773124 -0.3581015171984239,0.6377255102090702 -0.5227575192012937,0.6733601957479756 -0.5561123224654,0.6870150416476007 -0.5292444825767806,0.7455346450004855 -0.5340326012837339,0.7051355779081702 -0.4675557089419409,0.7572185625207623 -0.46524793800760417,0.6744336427288068 -0.4912697940303896,0.6656903958528677 -0.43641262967072025,0.6949632756319819 -0.4703460579253098,0.6917245639856182 -0.5019123156227828,0.6705063329217806 -0.48739246003339043,0.61614298564692 -0.4567562698403358,0.7723161076764565 -0.3929110063584417,0.8134693933213919 -0.4535871712949118,0.7127874987153471 -0.5557559228921515,0.6333975467102619 -0.48030355881692355,0.7895453070079386 -0.44636368750524097,0.7812104003247197 -0.4450164826288345,0.6219141498532731 -0.36083337421876777,0.6886393143430082 -0.33826329175074293,0.5892837933941131 -0.4491586433039609,0.6886572009849073 -0.3400201248591462,0.6693439430473546 -0.5571581929004141,0.7112815305580468 -0.4569447371083403,0.7650726020960796 -0.46033377732874714,0.7453510224901295 -0.44268074752299935,0.7030451135368131 -0.4746925279879096,0.7014440156208497 -0.5193301186750272,0.6274909187742723 -0.44953475131540727,0.6884262541419361 -0.4065687938934455,0.6521468055941153 -0.4815863944197614,0.5871107344493782 -0.5096991073374229,0.655158528103925 -0.4708724170991446,0.6933973547197685 -0.47086235510365,0.7114821521021251 -0.4544683681732724,0.6915068684665931 -0.4407412055689817,0.6581853768507563 -0.3648591879017801,0.6384008360322971 -0.5617535875444779,0.7653312128224149 -0.3910765851239329,0.6466664058114333 -0.4111000573165128,0.5544918087487614 -0.4523271254093024,0.6336410628792969 -0.42161837033265237,0.6975447261250193 -0.4809417385156048,0.6860294254949358 
-0.5508333930969999,0.6563558619536491 -0.3977168026923137,0.6488118554755715 -0.49910387271882295,0.7579884987929474 -0.37622353737507935,0.6361923092558474 -0.3574804529704139,0.6420250567314957 -0.42384896262923943,0.6654405685528438 -0.4317548645465034,0.5995038701009657 -0.5387502473946272,0.6909213572010845 -0.4947213495323156,0.7816281935184247 -0.44802125929586695,0.6376577751780176 -0.4846832750231929,0.6537935489957094 -0.4600624238588429,0.7053979706279919 -0.46100788617193844,0.6257681827834849 -0.377514311859985,0.6457564538259768 -0.4578190858733184,0.6995302979249203 -0.41874412472653233,0.6728141720798153 -0.473149252233069,0.7400684749188887 -0.5003692414253775,0.6579748519226128 -0.38671387150327524,0.6507910260223319 -0.4329288435011897,0.7554428872961404 -0.38937738950985945,0.6632714422558932 -0.507866448194171,0.6697989654204488 -0.4503577175594915,0.6436987778546469 -0.4004246379411161,0.7084190193924603 -0.5175674155634723,0.592000461758388 -0.49096655150152685,0.7179735738467282 -0.40157507962559785,0.724116842640191 -0.4996563962257568,0.7219602737166688 -0.4091447739402725,0.6607966997588274 -0.4908000549408518,0.654323415307034 -0.4029540032740767,0.7752269484038838 -0.4896063399150976,0.7151692301592091 -0.37365763622593734,0.6605479646305524 -0.45346212226665594,0.7753901076453064 -0.33296084571574036,0.614266466139344 -0.40595239179797177,0.7232822774566101 -0.4512160027410494,0.704067822452151 -0.387437481995008,0.690940315995756 -0.3712252979731263,0.6892111587672914 -0.5711661659061097,0.703213254888616 -0.46938253599058066,0.6541600457644752 -0.4664196694872788,0.5938181337796151 -0.4682109685010753,0.6839917471696231 -0.4012940182279379,0.6963437212566033 -0.5566515181145882,0.6597889423241242 -0.5103069429820604,0.6485648974692793 -0.5547128212607425,0.7739386059086971 -0.40791332374782513,0.7189626112158645 -0.43831715375374275,0.713504833105968 -0.37238584111033374,0.6988144453295679 -0.48974065751301804,0.6958484714351436 
-0.5692746243412625,0.7403417508986659 -0.41830905885228964,0.7634910496425027 -0.42199736012733086,0.7174980202292595 -0.49464193272381196,0.7361537788265963 -0.40907087347300763,0.6947946456557822 -0.5083428799414953,0.716828781048775 -0.4831330925746159,0.6247491194793605 -0.44865348138723904,0.5758531785946958 -0.4577811559637835,0.7379324357150788 -0.522405194942378,0.6737231435721743 -0.3862756721518123,0.6616181297653976 -0.4839168415317975,0.7271759685965089 -0.4110369138397671,0.7350427686567975 -0.40245055365606325,0.5796611914894637 -0.46790976219762026,0.672467715775991 -0.33134028011154776,0.7404073648550494 -0.4467874961511285,0.7531835861421509 -0.4088755433995198,0.7282914303989606 -0.4207078706101553,0.5930391708000566 -0.4117520706022664,0.6670040951943181 -0.42262382063582715,0.6521819959973684 -0.5139962578168572,0.6383414092367927 -0.3548675056073501,0.5756398781300174 -0.42737402271464386,0.6247226107248017 -0.4029245441915341,0.647720891088715 -0.4782163437043551,0.6493472650600458 -0.4084167466961997,0.6582527366417187 -0.4475291225009991,0.72742828667379 -0.5134000921112114,0.697318202211257 -0.3327686918306017,0.670927312357904 -0.542421551246306,0.7508850033882188 -0.41017350697467575,0.7686777031689295 -0.5334854982285328,0.6520753496640578 -0.5158682775368698,0.6457766299734807 -0.461559559918012,0.7704836184047354 -0.463496040926273,0.6401845701253146 -0.4665695906964886,0.7419659836170712 -0.4177874796925448,0.631331360120928 -0.4703878859858616,0.6346517952850421 -0.46087266397833115,0.700380940984194 -0.411991398297398,0.6685049441165432 -0.4822643197177507,0.7051680221161787 -0.4517640130810059,0.7408881809307233 -0.4321050011953649,0.6223072091852055 -0.5107466717954819,0.6079034790413904 -0.35654712490855694,0.621035503229739 -0.47036771057183174,0.6607480721163265 -0.3970297011370373,0.5894988720629702 -0.5599467559265916,0.6983675149602062 -0.46535963592278895,0.6969992977512803 -0.41361443993831637,0.6222560626131873 
-0.4401822208387065,0.6218675074291452 -0.4432148267348234,0.541456109615857 -0.3501775777648271,0.7014905938688097 -0.44790214814925716,0.6018382883180227 -0.3962874390994017,0.5915417550719401 -0.4863839234197195,0.6625220093412677 -0.44006434290460616,0.7511768119652019 -0.45087688544555077,0.6053088424500135 -0.5067792135238026,0.6456383855686012 -0.5015016746613962,0.681988995962167 -0.4772336166423342,0.6484208029970459 -0.4173796169176804,0.7072181642840298 -0.4165336020457162,0.680543214775675 -0.3661609696700325,0.6608371913564056 -0.391987820121063,0.7236968493302014 -0.3892719685118593,0.7788697431673581 -0.5301085138999533,0.7626243342095277 -0.40200739338602365,0.6926561696541189 -0.4455053646464521,0.6246017619993043 -0.37040012916985976,0.7379868002866075 -0.4467781146748516,0.7216118393512461 -0.37154841956817497,0.6817295372907666 -0.4283913961094055,0.6845269183355495 -0.4702484572609351,0.6914749284126309 -0.3250931779686398,0.709621715508677 -0.4475749519391271,0.5679401498619403 -0.3882477519584314,0.699734760648275 -0.47159840677368403,0.6802578520016836 -0.41909666112789046,0.6190248792008989 -0.4006203140881012,0.66626339061815 -0.43973851716347107,0.7711336279177594 -0.3828021992232982,0.5882801305744978 -0.48363001474834144,0.7406587809313476 -0.43276226247017857,0.605605136421127 -0.4134575158857422,0.5839298825616782 -0.44400261430935034,0.6465157306811863 -0.4469422220118166,0.6102083504533458 -0.42724207569630906,0.6624145462549679 -0.39839353854392734,0.7422760205833286 -0.4110615323138288,0.6776750744430879 -0.5228710908838383,0.6133393705720591 -0.3718251851015608,0.6014913801460962 -0.47173545138570167,0.7384099486166896 -0.314055821103469,0.7420212466178266 -0.3568781263819845,0.7334176056287606 -0.36628422285571244,0.5335654862619377 -0.401689334888296,0.49793864189914905 -0.4394530071033062,0.6866140374925823 -0.4858423959549596,0.6015158082597444 -0.4010009213773241,0.7007909137939402 -0.4147436144935248,0.5780459509303775 
-0.4325167875234476,0.7247228336126706 -0.5255723955954461,0.6369121713007524 -0.45691415833608584,0.6725831607799382 -0.48701232313631854,0.6231924690328337 -0.4104198083234973,0.6346867369681876 -0.4154368871178765,0.7859391905024669 -0.42651858714997465,0.6349205353041888 -0.48696750152345664,0.6741890539200615 -0.4651053785619632,0.6878417474715798 -0.30544723084808073,0.6411847677172069 -0.5303273754144491,0.6254257774800678 -0.4153165184126042,0.6132076719575392 -0.4273529921890888,0.7251932559382348 -0.4432850675573397,0.7039513592146693 -0.39259405956005106,0.6752339140851509 -0.41699584433004183,0.654170912515384 -0.4378219504936188,0.693439988514547 -0.31074410517129336,0.6433117321786577 -0.4379154266725257,0.7329514302320006 -0.41838542638802123,0.7552836784227681 -0.4258031164208473,0.6618401320220635 -0.41051809655809735,0.7408174411896121 -0.44127126368588554,0.6965947848753219 -0.3879306405891426,0.6900284837385406 -0.3526267071505626,0.6505614909934571 -0.4547952257966208,0.6419340168983827 -0.43942794506539207,0.6772180750347245 -0.43904410066043037,0.7204214852544447 -0.37094734277040536,0.6884043051213539 -0.4804032025669604,0.6361341500395522 -0.4748681695187831,0.7038381554545363 -0.3752978586523322,0.6245646819364572 -0.3604660202267007,0.7102459866672436 -0.4313042617858951,0.6414565748978879 -0.3491083548036878,0.6836895447655255 -0.5063967905582591,0.7198761534611415 -0.5022123443442756,0.6325419073307752 -0.4514332279853225,0.7527208421769097 -0.4539609570140348,0.6856336671542795 -0.46024996276761243,0.670160098014123 -0.4032590875644408,0.6231571902786621 -0.5042226199579852,0.6581530974385711 -0.3749593199345647,0.5859377714566751 -0.474002091440451,0.5987310819504316 -0.3753882692708233,0.6293707706548016 -0.402447506553297,0.6165443835253203 -0.32945609530556363,0.7797694581363979 -0.33705929560993114,0.704550266120669 -0.3259150315399129,0.6806397994818961 -0.4074551472030225,0.6870636885356911 -0.4358242407512993,0.7380298780080875 
-0.392500806676426,0.5452713712173056 -0.3725922709887964,0.6541188379541371 -0.38545822238503524,0.6738650951992445 -0.41487752507585124,0.6796227412409733 -0.4808943021452444,0.6848458788853872 -0.4709071139801141,0.7141419969359782 -0.37427889497038747,0.5914655527513522 -0.40032897214204166,0.684157089975876 -0.37819282887819583,0.6674277224501204 -0.36388799507810266,0.6709420745813102 -0.42700816749942666,0.7376558015111455 -0.4458915308306818,0.6856536890990212 -0.43924881033949437,0.6813977909855944 -0.40495675320442287,0.6911519270636368 -0.540182774807035,0.6416565314633761 -0.44648993367211515,0.5956692254378788 -0.4614870741199872,0.7765101484959016 -0.4437215349616229,0.6864939253515959 -0.3480917485915793,0.6471402273770314 -0.318324820026019,0.691188692620949 -0.3784229521039853,0.695009384357639 -0.3985385731774097,0.6323510482356004 -0.37871609434592923,0.6464530754139789 -0.420537273088553,0.7262314999568562 -0.4464314909253557,0.6809161443292338 -0.2937141301156161,0.5921438712272182 -0.332501602418597,0.6385528146702355 -0.2683299019117491,0.7170842964628498 -0.4768886642968768,0.6712431158219913 -0.4243744129628049,0.669719833562517 -0.4112139009604527,0.6874873347067759 -0.4258918483361846,0.5046472638000384 -0.3706097657119352,0.6529285814536034 -0.3720089400729066,0.6490451602352165 -0.2910956531664201,0.5823520420865577 -0.3288821702409864,0.6862355030747569 -0.4608462213552861,0.643857548934823 -0.38719954965418146,0.6683210889253145 -0.4385585385046104,0.6681024089102245 -0.40826224600084876,0.5765348411523948 -0.502343610548671,0.6942562016732381 -0.4376420809921928,0.5858832581699851 -0.322886775456483,0.7642576280788437 -0.4629167293407771,0.7076195499950553 -0.25648699420986293,0.5819044791657042 -0.384952427932897,0.6619620804121928 -0.4825520353996003,0.7464263651272439 -0.39743901189155795,0.6390506192677402 -0.4491942924590527,0.7008493719991722 -0.44547117320838325,0.6059368917360985 -0.5020909391761998,0.7573940243942867 
-0.4411911454870951,0.6594391839994898 -0.35972294488000905,0.680168975847298 -0.3495776972600118,0.6588697043517336 -0.4706264783666526,0.6822399413802366 -0.4575663986142285,0.6885515052243139 -0.3999867850487907,0.7143818747114001 -0.3868416440584384,0.6920739438946413 -0.4475814495780464,0.6728913256262967 -0.4211681455642312,0.726190895065533 -0.45139797371520496,0.6266613200385976 -0.42749077245206824,0.5824493413667338 -0.4363580018747338,0.6427561356450857 -0.43631441003877325,0.6121405361172033 -0.4210684257053611,0.6080818246820725 -0.43818950594823103,0.6212786570291743 -0.3902767156628187,0.6989424157102436 -0.40192102594779255,0.6927744174044579 -0.32510487049903625,0.5831071309501307 -0.4112817335428219,0.6036841773257691 -0.27578499551928565,0.6315093433253087 -0.3643091828333205,0.7013233681277065 -0.39773286987510925,0.6984213247218786 -0.4072921305080137,0.6605768914176604 -0.3805568403549015,0.6374875389036991 -0.39579939722026947,0.6368387996482997 -0.49078203854883956,0.6749652111273917 -0.43452132343707983,0.6897640352302 -0.3626592265552786,0.657312666632607 -0.33541375769575726,0.6809286383957143 -0.2872490782172016,0.7091035570866387 -0.3835868858416108,0.7170936023183017 -0.4055573008609051,0.7534918452836743 -0.36120720128130757,0.6604007209745034 -0.36458314732269825,0.694152449377389 -0.4964665907512039,0.7230510993199195 -0.30779629013493404,0.6286074241330063 -0.47585143764389853,0.6120085357977869 -0.28269331375125745,0.6564745204352749 -0.44064895563133694,0.6413320087726189 -0.38767606692636314,0.6936527105292679 -0.4399126018576885,0.6458326449334223 -0.35960195987856514,0.7131802117187407 -0.4440495963243875,0.7163923836506433 -0.3938313617688598,0.6342305966232035 -0.3599330528518706,0.6348398261177491 -0.47262295305947055,0.6789989794489294 -0.3653900747625449,0.580416316921563 -0.39928074910856204,0.7833512251615905 -0.34715152754608775,0.7108957680562618 -0.3949075686072314,0.6840850731104757 
-0.37490224472644007,0.588320271354348 -0.3393764315660381,0.6329918882347758 -0.3816604842909984,0.5978764250535629 -0.4739573512039859,0.7324331324123289 -0.3361253977731877,0.6195327388800251 -0.28878528715556,0.6532902310714384 -0.446517794219802,0.677241436838184 -0.3569465341673962,0.711526786848606 -0.09904138325179207,0.6888696627981844 -0.3694819894944994,0.7058785588490529 -0.3999237917162196,0.7264527799322226 -0.4620937048073881,0.6769205500448008 -0.3226403080004566,0.6508420498472192 -0.4478549504777613,0.605872134392586 -0.4344638475943639,0.6038073993896459 -0.384952427932897,0.6005010000979168 -0.2858920308035506,0.7575404253712411 -0.4619957231871289,0.6656746981333183 -0.3808520637291561,0.7459199590514725 -0.3626965609998325,0.6374949100276223 -0.3156695348284401,0.6221098998857035 -0.35216008587677455,0.6307232238446118 -0.31961147685815283,0.7036006987600455 -0.46593227898830486,0.6728317645770132 -0.4431557614474359,0.6842628680958961 -0.20771332545327162,0.7235942521697355 -0.29647574090828765,0.6791536723871815 -0.3622954466397087,0.6411763630136317 -0.2516864467436095,0.6829493055714859 -0.3525364278117469,0.6328894318238335 -0.3233488079793177,0.6371572986444344 -0.4433135163993962,0.7211199459527639 -0.3974874687136971,0.675857173560631 -0.36599666093940814,0.7177726764805604 -0.4136557902862971,0.6306205992381831 -0.324702451208668,0.6383390560921148 -0.36870173937243716,0.5336972007917005 -0.2634496678777473,0.7139459549497681 -0.40030148824628514,0.534450427009259 -0.2823007282228538,0.685692783261218 -0.38571132627907767,0.6637856465714131 -0.34246705708069414,0.638110040498204 -0.4636648106328049,0.7922432549025404 -0.4471763032792981,0.6051288405638067 -0.451588483448622,0.7138932699404216 -0.3909026341623874,0.6534206819580887 -0.38639537554502185,0.6166020483188487 -0.3519221314944825,0.6761171167634745 -0.3977425985749451,0.6445348475529444 -0.41855020982737645,0.66109572546289 -0.4032770580314131,0.657534645430372 
-0.40435384732883023,0.6547536355955197 -0.4496839908742663,0.6825280800867187 -0.30322485514096936,0.7408376104122038 -0.237710770696659,0.7125461958832695 -0.3625794963151563,0.6382131356543783 -0.3388994983211692,0.6568785394723037 -0.39594947555479904,0.6173107052375011 -0.3904468576682564,0.6625875297533569 -0.4074209783203813,0.7148809549422189 -0.37425050712880487,0.633072059317657 -0.36888240875524125,0.5804305191251875 -0.37977564981070994,0.6852620979660202 -0.40910389599122104,0.5175842960078423 -0.32872678270800365,0.724180941586436 -0.30937109739105845,0.673869753684424 -0.4626102897888014,0.6284981339699873 -0.3728933031274825,0.6996243304049029 -0.371293488940554,0.6504547061489132 -0.4644627104469697,0.6496135416358549 -0.3677529799107022,0.6103225382953412 -0.3420542335760234,0.7066333808873914 -0.44698164662538475,0.48824363447870156 -0.30546080222247857,0.5834806278985212 -0.39736913544922614,0.6870741770969471 -0.28332634253316963,0.576837929787108 -0.4030382823300008,0.6659948066609327 -0.45954305512528937,0.7131022324025604 -0.3283838166414001,0.6926718598484425 -0.3588280405958526,0.7186070394509304 -0.3516939832552742,0.6673542912886128 -0.283266582717373,0.6766299135883178 -0.3204124739472318,0.7247314078462532 -0.33904636400740623,0.7259311481840701 -0.2133236517188936,0.6614990678580702 diff --git a/data/derived/properties.synthesis.0.sem-div.csv b/data/derived/properties.synthesis.0.sem-div.csv deleted file mode 100644 index 16bf5ba..0000000 --- a/data/derived/properties.synthesis.0.sem-div.csv +++ /dev/null @@ -1,21 +0,0 @@ -entity_id,label,cluster_id,cluster_size -BFO.0000050,part of,12,64 -BFO.0000051,has part,3,59 -BFO.0000067,contains process,19,29 -RO.0002566,causally influences,1,28 -RO.0002213,positively regulates,29,26 -RO.0002331,involved in,6,24 -RO.0002200,has phenotype,31,23 -RO.0002089,starts before,7,22 -RO.0002490,existence overlaps,8,14 -RO.0015012,reciprocal of,17,14 -RO.0002083,before,2,12 -RO.0002326,contributes 
to,18,12 -PEL.000013,forces,4,10 -RO.0009003,immersed in,0,9 -RO.0002461,partner in,16,9 -RO.0002019,has ligand,32,8 -RO.0002578,directly regulates,10,7 -RO.0002434,interacts with,11,7 -RO.0002456,pollinated by,15,7 -RO.0002503,towards,5,5 diff --git a/data/derived/properties.synthesis.2.csv b/data/derived/properties.synthesis.2.csv new file mode 100644 index 0000000..8980023 --- /dev/null +++ b/data/derived/properties.synthesis.2.csv @@ -0,0 +1,340 @@ +entity_id,label,description,example +BFO.0000051,has part,a core relation that holds between a whole and its part, +BFO.0000054,realized in,, +BFO.0000055,realizes,, +BFO.0000062,preceded by,"x is preceded by y if and only if the time point at which y ends is before or equivalent to the time point at which x starts. Formally: x preceded by y iff ω(y) <= α(x), where α is a function that maps a process to a start point, and ω is a function that maps a process to an end point.", +BFO.0000063,precedes,"x precedes y if and only if the time point at which x ends is before or equivalent to the time point at which y starts. 
Formally: x precedes y iff ω(x) <= α(y), where α is a function that maps a process to a start point, and ω is a function that maps a process to an end point.", +BFO.0000066,occurs in,b occurs_in c =def b is a process and c is a material entity or immaterial entity& there exists a spatiotemporal region r and b occupies_spatiotemporal_region r.& forall(t) if b exists_at t then c exists_at t & there exist spatial regions s and s’ where & b spatially_projects_onto s at t& c is occupies_spatial_region s’ at t& s is a proper_continuant_part_of s’ at t, +BFO.0000067,contains process,[copied from inverse property 'occurs in'] b occurs_in c =def b is a process and c is a material entity or immaterial entity& there exists a spatiotemporal region r and b occupies_spatiotemporal_region r.& forall(t) if b exists_at t then c exists_at t & there exist spatial regions s and s’ where & b spatially_projects_onto s at t& c is occupies_spatial_region s’ at t& s is a proper_continuant_part_of s’ at t, +PEL.000000,activates,"convert (something, such as a provitamin) into a biologically active derivative", +PEL.000001,actuates,cause to act in a particular way; motivate, +PEL.000002,associates,combine or join with other parts, +PEL.000003,binds to,"to combine with, form a bond with, or be taken up by a chemical or chemical structure"," ""An enzyme is structured in such a way as to be able to bind with its substrate""" +PEL.000004,causes,make happen, +PEL.000005,constrains,"severely restrict the scope, extent, or activity of", +PEL.000006,controls,determine the behaviour or supervise the running of, +PEL.000007,degrades,break down or deteriorate chemically, +PEL.000008,destroys,end the existence of something by damaging or attacking it, +PEL.000009,directs,control the operations of; manage or govern, +PEL.000010,downregulates,"lower the rate or level of (a process such as gene expression, or sensitivity to a physiologically active substance) by downregulation", 
+PEL.000011,enhances,"intensify, increase, or further improve the quality", +PEL.000012,facilitates,make (an action or process) easy or easier, +PEL.000013,forces,make something happen, +PEL.000014,hinders,make it difficult for something to happen, +PEL.000015,inactivates,make inactive or inoperative, +PEL.000016,increases,"become or make greater in size, amount, or degree", +PEL.000017,induces,bring about or give rise to,none of these measures induced a change of policy +PEL.000018,inhibits,"hinder, restrain, or prevent (an action or process)", +PEL.000019,initiates,cause (a process or action) to begin,he proposes to initiate discussions on planning procedures +PEL.000022,limits,curtail or reduce in quantity or extent, +PEL.000023,mentions,refer to something, +PEL.000024,overproduces,produce an excessive amount,"if soils are too fertile, the vines can overproduce, leading to vegetal aromas and flavors" +PEL.000025,phosphorylates,introduce a phosphate group into (a molecule or compound), +PEL.000026,produces,cause (a particular result or situation) to happen or exist, +PEL.000027,promotes,, +PEL.000028,provokes,call forth an action, +PEL.000029,reduces,"diminish in size, amount, extent, or number", +PEL.000030,repress,inactivate (a gene or formation of a gene product), +PEL.000031,reverts,return to previous state, +PEL.000032,secretes,"(of a cell, gland, or organ) produce and discharge (a substance)",insulin is secreted in response to rising levels of glucose in the blood +PEL.000033,shuts,cause to cease or suspend an operation or activity, +PEL.000035,suppress,inhibit the genetic expression of, +PEL.000036,triggers,initiate or set off, +PEL.000037,upregulates,"increase the rate or level of (a process such as gene expression, or sensitivity to a physiologically active substance) by upregulation", +RO.0000053,has characteristic,Inverse of characteristic_of, +RO.0000056,participates in,"a relation between a continuant and a process, in which the continuant is somehow 
involved in the process", +RO.0000057,has participant,"a relation between a process and a continuant, in which the continuant is somehow involved in the process", +RO.0000058,is concretized as,"A relationship between a generically dependent continuant and a specifically dependent continuant, in which the generically dependent continuant depends on some independent continuant in virtue of the fact that the specifically dependent continuant also depends on that same independent continuant. A generically dependent continuant may be concretized as multiple specifically dependent continuants.", +RO.0000059,concretizes,"A relationship between a specifically dependent continuant and a generically dependent continuant, in which the generically dependent continuant depends on some independent continuant in virtue of the fact that the specifically dependent continuant also depends on that same independent continuant. Multiple specifically dependent continuants can concretize the same generically dependent continuant.", +RO.0000085,has function,"a relation between an independent continuant (the bearer) and a function, in which the function specifically depends on the bearer for its existence", +RO.0000086,has quality,"a relation between an independent continuant (the bearer) and a quality, in which the quality specifically depends on the bearer for its existence", +RO.0000087,has role,"a relation between an independent continuant (the bearer) and a role, in which the role specifically depends on the bearer for its existence", +RO.0000091,has disposition,"a relation between an independent continuant (the bearer) and a disposition, in which the disposition specifically depends on the bearer for its existence", +RO.0001000,derives from,"a relation between two distinct material entities, the new entity and the old entity, in which the new entity begins to exist when the old entity ceases to exist, and the new entity inherits the significant portion of the matter of the old 
entity", +RO.0001001,derives into,"a relation between two distinct material entities, the old entity and the new entity, in which the new entity begins to exist when the old entity ceases to exist, and the new entity inherits the significant portion of the matter of the old entity", +RO.0001022,has allergic trigger,"A relation between a condition (a phenotype or disease) of a host and a material entity, in which the material entity is not part of the host, and is considered harmless to non-allergic hosts, and the condition results in pathological processes that include an abnormally strong immune response against the material entity.", +RO.0001023,has autoimmune trigger,"A relation between a condition (a phenotype or disease) of a host and a material entity, in which the material entity is part of the host itself, and the condition results in pathological processes that include an abnormally strong immune response against the material entity.", +RO.0001025,located in,"a relation between two independent continuants, the target and the location, in which the target is entirely within the location", +RO.0002001,aligned with,, +RO.0002002,has 2D boundary,"a relation between a material entity and a 2D immaterial entity (the boundary), in which the boundary delimits the material entity", +RO.0002004,tracheates,The relationship that holds between a trachea or tracheole and an antomical structure that is contained in (and so provides an oxygen supply to)., +RO.0002007,bounding layer of,"X outer_layer_of Y iff: +. X :continuant that bearer_of some PATO:laminar +. X part_of Y +. exists Z :surface +. X has_boundary Z +. 
Z boundary_of Y + +has_boundary: http://purl.obolibrary.org/obo/RO_0002002 +boundary_of: http://purl.obolibrary.org/obo/RO_0002000", +RO.0002008,coincident with,A relation that holds between two linear structures that are approximately parallel to each other for their entire length and where either the two structures are adjacent to each other or one is part of the other., +RO.0002009,cell expresses,"A relation that applies between a cell(c) and a gene(g) , where the process of 'transcription, DNA templated (GO_0006351)' is occuring in in cell c and that process has input gene g.", +RO.0002011,regulates transport of,A relationship that holds between a process that regulates a transport process and the entity transported by that process., +RO.0002012,occurrent part of,A part of relation that applies only between occurrents., +RO.0002017,has component activity,, +RO.0002018,has component process,"w 'has process component' p if p and w are processes, w 'has part' p and w is such that it can be directly disassembled into into n parts p, p2, p3, ..., pn, where these parts are of similar type.", +RO.0002019,has ligand,"A relationship that holds between between a receptor and an chemical entity, typically a small molecule or peptide, that carries information between cells or compartments of a cell and which binds the receptor and regulates its effector function.", +RO.0002020,transports,Holds between p and c when p is a transport process or transporter activity and the outcome of this p is to move c from one location to another., +RO.0002021,occurs across,"A relationship between a process and a barrier, where the process occurs in a region spanning the barrier. For cellular processes the barrier is typically a membrane. 
Examples include transport across a membrane and membrane depolarization.", +RO.0002022,directly regulated by,, +RO.0002025,has effector activity,"A 'has effector activity' B if A and B are GO molecular functions (GO_0003674), A 'has component activity' B and B is the effector (output function) of B. Each compound function has only one effector activity.", +RO.0002084,during which ends,, +RO.0002085,encompasses,, +RO.0002086,ends after,, +RO.0002087,immediately preceded by,, +RO.0002088,during which starts,, +RO.0002089,starts before,, +RO.0002090,immediately precedes,, +RO.0002091,starts during,, +RO.0002092,happens during,, +RO.0002093,ends during,, +RO.0002100,has soma location,Relation between a neuron and a material anatomical entity that its soma is part of., +RO.0002101,fasciculates with,"relationship between a neuron and a neuron projection bundle (e.g.- tract or nerve bundle) that one or more of its projections travels through. +", +RO.0002103,synapsed by,Relation between an anatomical structure (including cells) and a neuron that chemically synapses to it. , +RO.0002111,releases neurotransmitter,, +RO.0002120,synapsed to," Relation between a neuron and an anatomical structure (including cells) that it chemically synapses to. + ", +RO.0002121,dendrite synapsed in,"Relation between a neuron and some structure (e.g.- a brain region) in which its dendrite receives synaptic input. + + ", +RO.0002131,overlaps,x overlaps y if and only if there exists some z such that x has part z and z part of y, +RO.0002134,innervates,"Relation between a 'neuron projection bundle' and a region in which one or more of its component neuron projections either synapses to targets or receives synaptic input. 
+T innervates some R +Expands_to: T has_fasciculating_neuron_projection that synapse_in some R.", +RO.0002151,partially overlaps,"x partially overlaps y iff there exists some z such that z is part of x and z is part of y, and it is also the case that neither x is part of y or y is part of x", +RO.0002158,shares ancestor with,two individual entities d1 and d2 stand in a shares_ancestor_with relation if and only if there exists some a such that d1 derived_by_descent_from a and d2 derived_by_descent_from a., +RO.0002163,spatially disjoint from,A is spatially_disjoint_from B if and only if they have no parts in common, +RO.0002170,connected to,"a is connected to b if and only if a and b are discrete structure, and there exists some connecting structure c, such that c connects a and b", +RO.0002176,connects,"c connects a if and only if there exist some b such that a and b are similar parts of the same system, and c connects b, specifically, c connects a with b. When one structure connects two others it unites some aspect of the function or role they play within the system.", +RO.0002178,supplies,"Relation between an arterial structure and another structure, where the arterial structure acts as a conduit channeling fluid, substance or energy.", +RO.0002179,drains,"Relation between an collecting structure and another structure, where the collecting structure acts as a conduit channeling fluid, substance or energy away from the other structure.", +RO.0002180,has component,"w 'has component' p if w 'has part' p and w is such that it can be directly disassembled into into n parts p, p2, p3, ..., pn, where these parts are of similar type.", +RO.0002200,has phenotype,"A relationship that holds between a biological entity and a phenotype. Here a phenotype is construed broadly as any kind of quality of an organism part, a collection of these qualities, or a change in quality or qualities (e.g. abnormally increased temperature). 
The subject of this relationship can be an organism (where the organism has the phenotype, i.e. the qualities inhere in parts of this organism), a genomic entity such as a gene or genotype (if modifications of the gene or the genotype causes the phenotype), or a condition such as a disease (such that if the condition inheres in an organism, then the organism has the phenotype).", +RO.0002202,develops from,x develops from y if and only if either (a) x directly develops from y or (b) there exists some z such that x directly develops from z and z develops from y, +RO.0002203,develops into,inverse of develops from, +RO.0002205,has gene product,x has gene product y if and only if x is a gene (SO:0000704) that participates in some gene expression process (GO:0010467) where the output of that process is either y or something that is ribosomally translated from y, +RO.0002206,expressed in,"x expressed in y if and only if there is a gene expression process (GO:0010467) that occurs in y, and one of the following holds: (i) x is a gene, and x is transcribed into a transcript as part of the gene expression process (ii) x is a transcript, and the transcription of x is part of the gene expression process (iii) x is a mature gene product such as a protein, and x was translated or otherwise processes from a transcript that was transcribed as part of this gene expression process", +RO.0002207,directly develops from,"Candidate definition: x directly_develops from y if and only if there exists some developmental process (GO:0032502) p such that x and y both participate in p, and x is the output of p and y is the input of p, and a substantial portion of the matter of x comes from y, and the start of x is coincident with or after the end of y.", +RO.0002209,has parasitoid,inverse of parasitoid of, +RO.0002210,directly develops into,inverse of directly develops from, +RO.0002211,regulates,"p regulates q iff p is causally upstream of q, the execution of p is not constant and varies 
according to specific conditions, and p influences the rate or magnitude of execution of q due to an effect either on some enabler of q or some enabler of a part of q.", +RO.0002212,negatively regulates,"p negatively regulates q iff p regulates q, and p decreases the rate or magnitude of execution of q.", +RO.0002213,positively regulates,"p positively regulates q iff p regulates q, and p increases the rate or magnitude of execution of q.", +RO.0002214,has prototype,"x has prototype y if and only if x is an instance of C and y is a prototypical instance of C. For example, every instance of heart, both normal and abnormal is related by the has prototype relation to some instance of a ""canonical"" heart, which participates in blood circulation.", +RO.0002219,surrounded by,"x surrounded_by y if and only if (1) x is adjacent to y and for every region r that is adjacent to x, r overlaps y (2) the shared boundary between x and y occupies the majority of the outermost boundary of x", +RO.0002221,surrounds,inverse of surrounded by, +RO.0002222,temporally related to,, +RO.0002223,starts,inverse of starts with, +RO.0002224,starts with,"x starts with y if and only if x has part y and the time point at which x starts is equivalent to the time point at which y starts. Formally: α(y) = α(x) ∧ ω(y) < ω(x), where α is a function that maps a process to a start point, and ω is a function that maps a process to an end point.", +RO.0002226,develops in,x develops_in y if x is located in y whilst x is developing, +RO.0002227,obligate parasite of,A sub-relation of parasite-of in which the parasite that cannot complete its life cycle without a host., +RO.0002229,ends,inverse of ends with, +RO.0002230,ends with,"x ends with y if and only if x has part y and the time point at which x ends is equivalent to the time point at which y ends. 
Formally: α(y) > α(x) ∧ ω(y) = ω(x), where α is a function that maps a process to a start point, and ω is a function that maps a process to an end point.", +RO.0002231,has start location,x 'has starts location' y if and only if there exists some process z such that x 'starts with' z and z 'occurs in' y, +RO.0002232,has end location,x 'has end location' y if and only if there exists some process z such that x 'ends with' z and z 'occurs in' y, +RO.0002233,has input,"p has input c iff: p is a process, c is a material entity, c is a participant in p, c is present at the start of p, and the state of c is modified during p.", +RO.0002234,has output,"p has output c iff c is a participant in p, c is present at the end of p, and c is not present in the same state at the beginning of p.", +RO.0002238,has component participant,X 'has component participant' Y means X 'has participant' Y and there is a cardinality constraint that specifies the numbers of Ys., +RO.0002240,has exposure receptor,"A broad relationship between an exposure event or process and any entity (e.g., an organism, organism population, or an organism part) that interacts with an exposure stimulus during the exposure event.", +RO.0002241,has exposure stressor,"A broad relationship between an exposure event or process and any agent, stimulus, activity, or event that causes stress or tension on an organism and interacts with an exposure receptor during an exposure event.", +RO.0002242,has exposure route,A broad relationship between an exposure event or process and a process by which the exposure stressor comes into contact with the exposure receptor, +RO.0002245,over-expressed in,"g is over-expressed in t iff g is expressed in t, and the expression level of g is increased relative to some background.", +RO.0002246,under-expressed in,"g is under-expressed in t iff g is expressed in t, and the expression level of g is decreased relative to some background.", +RO.0002248,has active ingredient,"A relationship that 
holds between a substance and a chemical entity, if the chemical entity is part of the substance, and the chemical entity forms the biologically active component of the substance.", +RO.0002252,connecting branch of,"b connecting-branch-of s iff b is connected to s, and there exists some tree-like structure t such that the mereological sum of b plus s is either the same as t or a branching-part-of t.", +RO.0002253,has connecting branch,inverse of connecting branch of, +RO.0002255,developmentally contributes to,inverse of has developmental contribution from, +RO.0002256,developmentally induced by,"t1 induced_by t2 if there is a process of developmental induction (GO:0031128) with t1 and t2 as interacting participants. t2 causes t1 to change its fate from a precursor material anatomical entity type T to T', where T' develops_from T", +RO.0002257,developmentally induces,Inverse of developmentally induced by, +RO.0002258,developmentally preceded by,"Candidate definition: x developmentally related to y if and only if there exists some developmental process (GO:0032502) p such that x and y both participates in p, and x is the output of p and y is the input of p", +RO.0002260,has biological role,c has-biological-role r iff c has-role r and r is a biological role (CHEBI:24432), +RO.0002261,has application role,c has-application-role r iff c has-role r and r is an application role (CHEBI:33232), +RO.0002262,has chemical role,c has-chemical-role r iff c has-role r and r is a chemical role (CHEBI:51086), +RO.0002263,acts upstream of,"c acts upstream of p if and only if c enables some f that is involved in p' and p' occurs chronologically before p, is not part of p, and affects the execution of p. 
c is a material entity and f, p, p' are processes.", +RO.0002286,developmentally succeeded by,Inverse of developmentally preceded by, +RO.0002291,ubiquitously expressed in,"x is ubiquitously expressed in y if and only if x is expressed in y, and the majority of cells in y express x", +RO.0002292,expresses,"y expresses x if and only if there is a gene expression process (GO:0010467) that occurs in y, and one of the following holds: (i) x is a gene, and x is transcribed into a transcript as part of the gene expression process (ii) x is a transcript, and x was transcribed from a gene as part of the gene expression process (iii) x is a mature gene product (protein or RNA), and x was translated or otherwise processed from a transcript that was transcribed as part of the gene expression process.", +RO.0002293,ubiquitously expresses,inverse of ubiquitously expressed in, +RO.0002303,has habitat,"x 'has habitat' y if and only if: x is an organism, y is a habitat, and y can sustain and allow the growth of a population of xs.", +RO.0002309,has exposure stimulus,"A relationship between an exposure event or process and any agent, stimulus, activity, or event that causally affects an organism and interacts with an exposure receptor during an exposure event.", +RO.0002320,evolutionarily related to,A relationship that holds via some environmental process, +RO.0002321,ecologically related to,A relationship that is mediated in some way by the environment or environmental feature (ENVO:00002297), +RO.0002322,confers advantage in,, +RO.0002323,mereotopologically related to,A mereological relationship or a topological relationship, +RO.0002324,developmentally related to,A relationship that holds between entities participating in some developmental process (GO:0032502), +RO.0002325,colocalizes with,a colocalizes_with b if and only if a is transiently or peripherally associated with b[GO]., +RO.0002326,contributes to,, +RO.0002327,enables,c enables p iff c is capable of p and c acts to 
execute p., +RO.0002328,functionally related to,"A grouping relationship for any relationship directly involving a function, or that holds because of a function of one of the related entities.", +RO.0002330,genomically related to,"holds between two entities when some genome-level process such as gene expression is involved. This includes transcriptional, spliceosomal events. These relations can be used between either macromolecule entities (such as regions of nucleic acid) or between their abstract informational counterparts.", +RO.0002331,involved in,"c involved_in p if and only if c enables some process p', and p' is part of p", +RO.0002332,regulates levels of,p regulates levels of c if p regulates some amount (PATO:0000070) of c, +RO.0002333,enabled by,inverse of enables, +RO.0002334,regulated by,inverse of regulates, +RO.0002335,negatively regulated by,inverse of negatively regulates, +RO.0002336,positively regulated by,inverse of positively regulates, +RO.0002340,imports,"Holds between p and c when p is a transportation or localization process and the outcome of this process is to move c to a destination that is part of some s, where the start location of c is part of the region that surrounds s.", +RO.0002345,exports,"Holds between p and c when p is a transportation or localization process and the outcome of this process is to move c to a destination that is part of some s, where the end location of c is part of the region that surrounds s.", +RO.0002351,has member,has member is a mereological relation between a collection and an item., +RO.0002360,has dendrite location,, +RO.0002371,attached to,"a is attached to b if and only if a and b are discrete objects or object parts, and there are physical connections between a and b such that a force pulling a will move b, or a force pulling b will move a", +RO.0002372,has muscle origin,"m has_muscle_origin s iff m is attached_to s, and it is the case that when m contracts, s does not move. 
The site of the origin tends to be more proximal and have greater mass than what the other end attaches to.", +RO.0002373,has muscle insertion,"m has_muscle_insertion s iff m is attached_to s, and it is the case that when m contracts, s moves. Insertions are usually connections of muscle via tendon to bone.", +RO.0002380,branching part of,x is a branching part of y if and only if x is part of y and x is connected directly or indirectly to the main stem of y, +RO.0002400,has direct input,"p has direct input c iff c is a participant in p, c is present at the start of p, and the state of c is modified during p.", +RO.0002407,indirectly positively regulates,p indirectly positively regulates q iff p is indirectly causally upstream of q and p positively regulates q., +RO.0002409,indirectly negatively regulates,p indirectly negatively regulates q iff p is indirectly causally upstream of q and p negatively regulates q., +RO.0002410,causally related to,"relation that links two events, processes, states, or objects such that one event, process, state, or object (a cause) contributes to the production of another event, process, state, or object (an effect) where the cause is partly or wholly responsible for the effect, and the effect is partly or wholly dependent on the cause.", +RO.0002413,provides input for,"p provides input for q iff p is immediately causally upstream of q, and there exists some c such that p has_output c and q has_input c.", +RO.0002424,differs in,, +RO.0002432,is active in,"c executes activity in d if and only if c enables p and p occurs_in d. 
Assuming no action at a distance by gene products, if a gene product enables (is capable of) a process that occurs in some structure, it must have at least some part in that structure.", +RO.0002434,interacts with,A relationship that holds between two entities in which the processes executed by the two entities are causally connected., +RO.0002435,genetically interacts with,, +RO.0002436,molecularly interacts with,An interaction relationship in which the two partners are molecular entities that directly physically interact with each other for example via a stable binding interaction or a brief interaction during which one modifies the other., +RO.0002437,biotically interacts with,An interaction relationship in which at least one of the partners is an organism and the other is either an organism or an abiotic entity with which the organism interacts., +RO.0002438,trophically interacts with,An interaction relationship in which the partners are related via a feeding relationship., +RO.0002439,preys on,"An interaction relationship involving a predation process, where the subject kills the target in order to eat it or to feed to siblings, offspring or group members", +RO.0002440,symbiotically interacts with,A biotic interaction in which the two organisms live together in more or less intimate association., +RO.0002441,commensually interacts with,An interaction relationship between two organisms living together in more or less intimate association in a relationship in which one benefits and the other is unaffected (GO)., +RO.0002442,mutualistically interacts with,An interaction relationship between two organisms living together in more or less intimate association in a relationship in which both organisms benefit from each other (GO)., +RO.0002445,parasitized by,Inverse of parasite of, +RO.0002451,transmitted by,A relationship that holds between a disease and organism, +RO.0002452,has symptom,A relation that holds between a disease or an organism and a phenotype, 
+RO.0002454,has host,"X 'has host' y if and only if: x is an organism, y is an organism, and x can live on the surface of or within the body of y", +RO.0002455,pollinates,, +RO.0002456,pollinated by,, +RO.0002457,acquires nutrients from,Inverse of provides nutrients for, +RO.0002458,preyed upon by,inverse of preys on, +RO.0002459,is vector for,, +RO.0002460,has vector,, +RO.0002465,is symbiosis,, +RO.0002466,is commensalism,, +RO.0002467,is mutualism,, +RO.0002468,is parasitism,, +RO.0002469,provides nutrients for,A biotic interaction where a material entity provides nutrition for an organism., +RO.0002470,eats,A biotic interaction where one organism consumes a material entity through a type of mouth or other oral opening., +RO.0002471,is eaten by,Inverse of eats, +RO.0002472,is evidence for,"A relationship between a piece of evidence a and some entity b, where b is an information content entity, material entity or process, and +the a supports either the existence of b, or the truth value of b.", +RO.0002473,composed primarily of,x composed_primarily_of y if and only if more than half of the mass of x is made from y or units of the same type as y., +RO.0002480,ubiquitinates,An interaction relation between x and y in which x catalyzes a reaction in which one or more ubiquitin groups are added to y, +RO.0002481,is kinase activity,, +RO.0002482,is ubiquitination,, +RO.0002485,receives input from,, +RO.0002486,sends output to,, +RO.0002488,existence starts during,x existence starts during y if and only if the time point at which x starts is after or equivalent to the time point at which y starts and before or equivalent to the time point at which y ends. Formally: x existence starts during y iff α(x) >= α(y) & α(x) <= ω(y)., +RO.0002489,existence starts with,x starts ends with y if and only if the time point at which x starts is equivalent to the time point at which y starts. 
Formally: x existence starts with y iff α(x) = α(y)., +RO.0002490,existence overlaps,x existence overlaps y if and only if either (a) the start of x is part of y or (b) the end of x is part of y. Formally: x existence starts and ends during y iff (α(x) >= α(y) & α(x) <= ω(y)) OR (ω(x) <= ω(y) & ω(x) >= α(y)), +RO.0002492,existence ends during,x existence ends during y if and only if the time point at which x ends is before or equivalent to the time point at which y ends and after or equivalent to the point at which y starts. Formally: x existence ends during y iff ω(x) <= ω(y) and ω(x) >= α(y)., +RO.0002493,existence ends with,x existence ends with y if and only if the time point at which x ends is equivalent to the time point at which y ends. Formally: x existence ends with y iff ω(x) = ω(y)., +RO.0002502,depends on,, +RO.0002505,has intermediate,"p has intermediate c if and only if p has parts p1, p2 and p1 has output c, and p2 has input c", +RO.0002507,determined by,"s determined by f if and only if s is a type of system, and f is a material entity that is part of s, such that f exerts a strong causal influence on the functioning of s, and the removal of f would cause the collapse of s.", +RO.0002508,determines,inverse of determined by, +RO.0002510,transcribed from,x is transcribed from y if and only if x is synthesized from template y, +RO.0002511,transcribed to,inverse of transcribed from, +RO.0002513,ribosomally translates to,inverse of ribosomal translation of, +RO.0002514,sequentially related to,A relation that holds between two entities that have the property of being sequences or having sequences. 
, +RO.0002516,has start sequence,"x has start sequence y if the start of x is identical to the start of y, and x has y as a subsequence", +RO.0002518,has end sequence,"x has end sequence y if the end of x is identical to the end of y, and x has y as a subsequence", +RO.0002522,bounds sequence of,"x bounds the sequence of y iff the upstream-most part of x is upstream of or coincident with the upstream-most part of y, and the downstream-most part of x is downstream of or coincident with the downstream-most part of y", +RO.0002524,has subsequence,x has subsequence y iff all of the sequence parts of y are sequence parts of x, +RO.0002525,is subsequence of,inverse of has subsequence, +RO.0002551,has skeleton,A relation between a segment or subdivision of an organism and the maximal subdivision of material entities that provides structural support for that segment or subdivision., +RO.0002554,hyperparasitized by,inverse of hyperparasite of, +RO.0002557,has pathogen,A host interaction where the smaller of the two members of a symbiosis causes a disease in the larger member, +RO.0002558,has evidence,inverse of is evidence for, +RO.0002559,causally influenced by,, +RO.0002566,causally influences,"The entity or characteristic A is causally upstream of the entity or characteristic B, A having an effect on B. An entity corresponds to any biological type of entity as long as a mass is measurable. 
A characteristic corresponds to a particular specificity of an entity (e.g., phenotype, shape, size).", +RO.0002567,biomechanically related to,A relation that holds between elements of a musculoskeletal system or its analogs., +RO.0002568,has muscle antagonist,"m1 has_muscle_antagonist m2 iff m1 has_muscle_insertion s, m2 has_muscle_insertion s, m1 acts in opposition to m2, and m2 is responsible for returning the structure to its initial position.", +RO.0002569,has branching part,inverse of branching part of, +RO.0002573,has modifier,A relation that holds between an attribute or a qualifier and another attribute., +RO.0002578,directly regulates,p directly regulates q iff p is immediately causally upstream of q and p regulates q., +RO.0002596,capable of regulating,"Holds between c and p if and only if c is capable of some activity a, and a regulates p.", +RO.0002607,is marker for,"c is marker for d iff the presence or occurrence of d is correlated with the presence or occurrence of c, and the observation of c is used to infer the presence or occurrence of d. Note that this does not imply that c and d are in a direct causal relationship, as it may be the case that there is a third entity e that stands in a direct causal relationship with c and d.", +RO.0002610,correlated with,"A relationship that holds between two entities, where the entities exhibit a statistical dependence relationship. 
The entities may be statistical variables, or they may be other kinds of entities such as diseases, chemical entities or processes.", +RO.0002615,has model,Inverse of is-model-of, +RO.0002618,visits,, +RO.0002619,visited by,, +RO.0002622,visits flowers of,, +RO.0002624,lays eggs in,, +RO.0002626,kills,, +RO.0002627,is killed by,, +RO.0002629,directly positively regulates,"p directly positively regulates q iff p is immediately causally upstream of q, and p positively regulates q.", +RO.0002630,directly negatively regulates,"p directly negatively regulates q iff p is immediately causally upstream of q, and p negatively regulates q.", +RO.0002633,has ectoparasite,inverse of ectoparasite of, +RO.0002635,has endoparasite,, +RO.0002637,has mesoparasite,inverse of mesoparasite of, +RO.0002639,has intercellular endoparasite,inverse of intercellular endoparasite of, +RO.0002641,has intracellular endoparasite,inverse of intracellular endoparasite of, +RO.0002801,co-roosts with,"Two or more individuals sharing the same roost site (cave, mine, tree or tree hollow, animal burrow, leaf tent, rock crack, space in man-made structure, etc.). Individuals that are sharing a communal roost may be said to be co-roosting. The roost may be either a day roost where the individuals rest during daytime hours, or a night roost where individuals roost to feed, groom, or rest in between flights and/or foraging bouts. Communal roosting as thus defined is an umbrella term within which different specialized types -- which are not mutually exclusive -- may be recognized based on taxonomy and the temporal and spatial relationships of the individuals that are co-roosting.", +RO.0002803,has reservoir host,inverse of reservoir host of, +RO.0003001,produced by,a produced_by b iff some process that occurs_in b has_output a., +RO.0003002,represses expression of,Holds between entity A (a transcription factor) and a nucleic acid B if and only if A down-regulates the expression of B. 
The nucleic acid can be a gene or an mRNA., +RO.0003003,increases expression of,Holds between entity A (a transcription factor) and nucleic acid B if and only if A up-regulates the expression of B. The nucleic acid can be a gene or mRNA., +RO.0003303,causes condition,"A relationship between an entity (e.g. a genotype, genetic variation, chemical, or environmental exposure) and a condition (a phenotype or disease), where the entity has some causal role for the condition.", +RO.0003304,contributes to condition,"A relationship between an entity (e.g. a genotype, genetic variation, chemical, or environmental exposure) and a condition (a phenotype or disease), where the entity has some contributing role that influences the condition.", +RO.0003307,ameliorates condition,"A relationship between an entity (e.g. a genotype, genetic variation, chemical, or environmental exposure) and a condition (a phenotype or disease), where the presence of the entity reduces or eliminates some or all aspects of the condition.", +RO.0003308,correlated with condition,A relationship between an entity and a condition (phenotype or disease) with which it exhibits a statistical dependence relationship., +RO.0003309,exacerbates condition,"A relationship between an entity (e.g. a chemical, environmental exposure, or some form of genetic variation) and a condition (a phenotype or disease), where the presence of the entity worsens some or all aspects of the condition.", +RO.0003310,condition ameliorated by,"A relationship between a condition (a phenotype or disease) and an entity (e.g. a chemical, environmental exposure, or some form of genetic variation) where some or all aspects of the condition are reduced or eliminated by the presence of the entity.", +RO.0003311,condition exacerbated by,"A relationship between a condition (a phenotype or disease) and an entity (e.g. 
a chemical, environmental exposure, or some form of genetic variation) where some or all aspects of the condition are worsened by the presence of the entity.", +RO.0004008,has primary output,"p has primary output c if (a) p has output c and (b) the goal of process is to modify, produce, or transform c.", +RO.0004009,has primary input,"p has primary input c if (a) p has input c and (b) the goal of process is to modify, consume, or transform c.", +RO.0004026,disease has location,A relationship between a disease and an anatomical entity where the disease has one or more features that are located in that entity., +RO.0004029,disease has feature,"A relationship between a disease and some feature of that disease, where the feature is either a phenotype or an isolated disease.", +RO.0004031,enables subfunction,"Holds between an entity and an process P where the entity enables some larger compound process, and that larger process has-part P.", +RO.0007000,has driver,"A relation between two entities, in which one of the entities is any natural or human-influenced factor that directly or indirectly causes a change in the other entity.", +RO.0007001,has disease driver,"A relation between an entity and a disease of a host, in which the entity is not part of the host itself, and the condition results in pathological processes.", +RO.0008502,has epiphyte,inverse of epiphyte of, +RO.0008504,kleptoparasitized by,inverse of kleptoparasite of, +RO.0008505,creates habitat for,An interaction relationship wherein one organism creates a structure or environment that is lived in by another organism., +RO.0008506,ecologically co-occurs with,An interaction relationship describing organisms that often occur together at the same time and space or in the same environment., +RO.0008507,lays eggs on,An interaction relationship in which organism a lays eggs on the outside surface of organism b. 
Organism b is neither helped nor harmed in the process of egg laying or incubation., +RO.0008509,has roost,"x 'has roost' y if and only if: x is an organism, y is a habitat, and y can support rest behaviors x.", +RO.0009001,has substance added,"""has substance added"" is a relation existing between a (physical) entity and a substance in which the entity has had the substance added to it at some point in time.", +RO.0009002,has substance removed,"""has substance removed"" is a relation existing between two physical entities in which the first entity has had the second entity (a substance) removed from it at some point in time.", +RO.0009003,immersed in,"""immersed in"" is a relation between a (physical) entity and a fluid substance in which the entity is wholly or substantially surrounded by the substance.", +RO.0009004,has consumer,'has consumer' is a relation between a material entity and an organism in which the former can normally be digested or otherwise absorbed by the latter without immediate or persistent ill effect., +RO.0009006,assay measures characteristic,"A relation between an assay and a characteristic, in which the assay generates a data item which is a measure of a characteristic.", +RO.0010001,generically depends on,A generically dependent continuant *b* generically depends on an independent continuant *c* at time *t* means: there inheres in *c* a specifically dependent continuant which concretizes *b* at *t*., +RO.0010002,is carrier of,*b* is carrier of *c* at time *t* if and only if *c* *g-depends on* *b* at *t*, +RO.0011002,regulates activity of,"The entity A has an activity that regulates an activity of the entity B. 
For example, A and B are gene products where the catalytic activity of A regulates the kinase activity of B.", +RO.0011003,regulates quantity of,The entity A has an activity that regulates the quantity or abundance or concentration of the entity B., +RO.0011014,destabilizes quantity of,An entity A directly interacts with B and A has an activity that decreases the amount of an entity B by degrading it., +RO.0011015,stabilizes quantity of,An entity A physically interacts with B and A has an activity that increases the amount of an entity B by stabilizing it., +RO.0012010,removes input for,"p removes input for q iff p is causally upstream of q, there exists some c such that p has_input c and q has_input c, p reduces the levels of c, and c is rate limiting for execution of q.", +RO.0012012,indirectly regulates,p indirectly regulates q iff p is indirectly causally upstream of q and p regulates q., +RO.0015001,has exemplar data,A relation between a material entity and some data in which the data is taken as exemplifying the material entity., +RO.0015010,has relative magnitude,, +RO.0015011,has cross section,"s3 has_cross_section s2 if and only if : there exists some 2d plane that intersects the bearer of s3, and the impression of s3 upon that plane has shape quality s2.", +RO.0016002,has disease,A relationship that holds between an organism and a disease.
Here a disease is construed broadly as a disposition to undergo pathological processes that exists in an organism because of one or more disorders in that organism., +RO.0016004,has exposure medium,"X has exposure medium Y if X is an exposure event (process), Y is a material entity, and the stimulus for X is transmitted or carried in Y.", +RO.0017001,device utilizes material,"X device utilizes material Y means X and Y are material entities, and X is capable of some process P that has input Y.", +RO.0017003,positively correlated with,A relation between entities in which one increases or decreases as the other does the same., +RO.0017004,negatively correlated with,A relation between entities in which one increases as the other decreases., +RO.0017005,contains measured amount,A relation between a container and measurement datum that specifies the actual amount of material in the container., +RO.0017006,has maximum capacity,A relation that relates a container to a measurement datum that specifies the maximum capacity of the container. Capacity can refer to either weight or volume., +RO.0017008,owns,A primitive relation that holds between entities x and y in which y is at x's full disposal., +RO.0017009,is owned by,Inverse of the owns relation., +RO.0018001,is myristoyltransferase activity,Helper relation for OWL definition of RO:0018002 myristoylates, +RO.0018003,myristoylated by,inverse of myristoylates, +RO.0018027,is agonist of,a relation between a ligand (material entity) and a receptor (material entity) that implies the binding of the ligand to the receptor activates some activity of the receptor, +RO.0018029,is antagonist of,a relation between a ligand (material entity) and a receptor (material entity) that implies the binding of the ligand to the receptor reduces some activity of the receptor to basal level, +RO.0018036,is tautomer of,"Two chemicals are tautomers if they can be readily interconverted. 
+ +This commonly refers to prototropy in which a hydrogen's position is changed, such as between ketones and enols. This is also often observed in heterocyclic rings, e.g., ones containing nitrogens and/or have aryl functional groups containing heteroatoms.", +RO.0018038,has functional parent,"Chemical A has functional parent Chemical B if there is chemical transformation through which chemical B can be produced from chemical A. + +For example, the relationship between a salt and a freebased compound is a ""has functional parent"" relationship.", +RO.0018039,is enantiomer of,"Chemicals A and B are enantiomers if they share the same molecular graph except the change of the configuration of substituents around exactly one chiral center. + +A chemical with no chiral centers can not have an enantiomer. A chemical with multiple chiral centers can have multiple enantiomers, but its enantiomers are not themselves enantiomers (they are diastereomers).", +RO.0018040,has parent hydride,Chemical A has parent hydride Chemical B if there exists a molecular graphical transformation where functional groups on A are replaced with hydrogens in order to yield B., +RO.0019000,regulates characteristic,A relationship that holds between a process and a characteristic in which process (P) regulates characteristic (C) iff: P results in the existence of C OR affects the intensity or magnitude of C., +RO.0019001,positively regulates characteristic,A relationship that holds between a process and a characteristic in which process (P) positively regulates characteristic (C) iff: P results in an increase in the intensity or magnitude of C., +RO.0019002,negatively regulates characteristic,A relationship that holds between a process and a characteristic in which process (P) negatively regulates characteristic (C) iff: P results in a decrease in the intensity or magnitude of C., +RO.0020202,has numerator,A relationship between a ratio or proportion and its dividend., +RO.0020203,has denominator,A 
relationship between a ratio or proportion and its divisor., diff --git a/data/derived/properties.synthesis.2.diverse.csv b/data/derived/properties.synthesis.2.diverse.csv new file mode 100644 index 0000000..b0051bc --- /dev/null +++ b/data/derived/properties.synthesis.2.diverse.csv @@ -0,0 +1,11 @@ +entity_id,label,score,icluster +RO.0002351,has member,0.8852101037159592,0 +BFO.0000067,contains process,0.8590712292239969,1 +RO.0002092,happens during,0.8830400824506608,2 +PEL.000013,forces,0.7458585441264061,3 +RO.0002626,kills,0.7877325132306823,4 +RO.0002331,involved in,0.9025573146738246,5 +BFO.0000051,has part,0.9040580382836347,6 +RO.0003001,produced by,0.8628781747555726,7 +RO.0002615,has model,0.8580200770429982,8 +RO.0002001,aligned with,0.7397136387102962,9 diff --git a/pelinker/analysis.py b/pelinker/analysis.py index 249b0d7..cb43737 100644 --- a/pelinker/analysis.py +++ b/pelinker/analysis.py @@ -200,133 +200,6 @@ def cosine_similarity_std(tensor): return std_dev -def adjust_cluster_count( - df_umap: pd.DataFrame, - umap_columns: list[str], - current_n_clusters: int, - target_n_clusters: int, - base_min_cluster_size: int, -) -> tuple[pd.DataFrame, int, hdbscan.HDBSCAN]: - """ - Adjust HDBSCAN clustering to get closer to target number of clusters. 
- - Args: - df_umap: DataFrame with UMAP-reduced embeddings - umap_columns: List of column names for UMAP dimensions - current_n_clusters: Current number of clusters - target_n_clusters: Desired number of clusters - base_min_cluster_size: Base min_cluster_size to adjust from - - Returns: - tuple: (clustered_dataframe, final_n_clusters, clusterer) - """ - # Initial clusterer - initial_clusterer = hdbscan.HDBSCAN( - min_cluster_size=base_min_cluster_size, gen_min_span_tree=True - ) - initial_labels = initial_clusterer.fit_predict(df_umap[umap_columns]) - - if current_n_clusters == target_n_clusters: - initial_df = df_umap.copy() - initial_df["class"] = initial_labels - return initial_df, current_n_clusters, initial_clusterer - - best_n_clusters = current_n_clusters - best_df = df_umap.copy() - best_clusterer = initial_clusterer - - # Try increasing min_cluster_size to reduce number of clusters - if current_n_clusters > target_n_clusters: - for size_mult in [1.2, 1.5, 2.0, 2.5]: - test_size = int(base_min_cluster_size * size_mult) - if test_size >= len(df_umap): - continue - clusterer = hdbscan.HDBSCAN( - min_cluster_size=test_size, gen_min_span_tree=True - ) - labels = clusterer.fit_predict(df_umap[umap_columns]) - test_n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - if abs(test_n_clusters - target_n_clusters) < abs( - best_n_clusters - target_n_clusters - ): - best_n_clusters = test_n_clusters - df_test = df_umap.copy() - df_test["class"] = labels - best_df = df_test - best_clusterer = clusterer - if best_n_clusters == target_n_clusters: - break - - # Try decreasing min_cluster_size to increase number of clusters - elif current_n_clusters < target_n_clusters: - for size_mult in [0.8, 0.6, 0.5, 0.4]: - test_size = max(2, int(base_min_cluster_size * size_mult)) - clusterer = hdbscan.HDBSCAN( - min_cluster_size=test_size, gen_min_span_tree=True - ) - labels = clusterer.fit_predict(df_umap[umap_columns]) - test_n_clusters = len(set(labels)) - (1 if -1 in 
labels else 0) - if abs(test_n_clusters - target_n_clusters) < abs( - best_n_clusters - target_n_clusters - ): - best_n_clusters = test_n_clusters - df_test = df_umap.copy() - df_test["class"] = labels - best_df = df_test - best_clusterer = clusterer - if best_n_clusters == target_n_clusters: - break - - return best_df, best_n_clusters, best_clusterer - - -def cluster_with_target_count( - df_umap: pd.DataFrame, - umap_columns: list[str], - target_n_clusters: int, - base_min_cluster_size: int | None = None, -) -> tuple[pd.DataFrame, int, float]: - """ - Cluster data to get approximately target number of clusters and compute DBCV score. - - Args: - df_umap: DataFrame with UMAP-reduced embeddings - umap_columns: List of column names for UMAP dimensions - target_n_clusters: Desired number of clusters - base_min_cluster_size: Starting min_cluster_size (if None, estimates from data size) - - Returns: - tuple: (clustered_dataframe, actual_n_clusters, dbcv_score) - """ - if base_min_cluster_size is None: - # Estimate starting point: aim for clusters of roughly equal size - base_min_cluster_size = max(2, len(df_umap) // (target_n_clusters * 3)) - - # Start with estimated size - clusterer = hdbscan.HDBSCAN( - min_cluster_size=base_min_cluster_size, gen_min_span_tree=True - ) - labels = clusterer.fit_predict(df_umap[umap_columns]) - current_n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - - # Adjust to get closer to target - df_clustered, final_n_clusters, final_clusterer = adjust_cluster_count( - df_umap, - umap_columns, - current_n_clusters, - target_n_clusters, - base_min_cluster_size, - ) - - # Compute DBCV score from the final clusterer - if hasattr(final_clusterer, "relative_validity_"): - dbcv = float(final_clusterer.relative_validity_) - else: - dbcv = 0.0 # Invalid clustering - - return df_clustered, final_n_clusters, dbcv - - def get_word_frequencies_from_library( language: str = "en", wordlist: str = "best", @@ -374,9 +247,20 @@ def __getitem__(self, 
word: str) -> float: def _measure_label_simplicity( label: str, word_frequencies: Mapping[str, float], - stopwords: Iterable[str] = ("is", "of", "the", "a", "an", "to", "for", "or", "in"), + stopwords: Iterable[str] = ( + "is", + "of", + "the", + "a", + "an", + "to", + "for", + "or", + "in", + "has", + ), zero_freq_penalty: float = 1e-8, - multiword_penalty: float = 0.15, + multiword_penalty: float = 0.2, stopword_penalty: float = 0.3, ) -> dict[str, int | float]: """...""" @@ -428,142 +312,6 @@ def _measure_label_simplicity( } -def find_cluster_centers( - df_clustered: pd.DataFrame, - id_column: str = "id", - label_column: str = "label", - min_cluster_size: int = 5, - max_complexity_chars: int | None = None, - max_complexity_words: int | None = None, - min_simplicity_score: float | None = None, - max_word_length: int | None = None, - word_frequencies: dict[str, float] | None = None, -) -> list[dict]: - """ - Find a representative item for each cluster, filtering by size and complexity. - - Args: - df_clustered: DataFrame with cluster assignments in 'class' column - id_column: Name of column containing item IDs - label_column: Name of column containing item labels - min_cluster_size: Minimum number of members required (default: 5) - max_complexity_chars: Maximum character count for candidates (None = no limit) - max_complexity_words: Maximum word count for candidates (None = no limit) - min_simplicity_score: Minimum simplicity score (based on word frequency harmonic mean) - Higher = simpler. None = no limit. - max_word_length: Maximum length for any word in the label (None = no limit) - word_frequencies: Optional dictionary mapping words to frequencies for simplicity calculation. 
- - Returns: - List of dictionaries with cluster_id, cluster_size, center_id, center_label, - sorted by cluster_size (descending) - """ - cluster_results = [] - - # Filter out noise points and group by cluster - df_valid = df_clustered[df_clustered["class"] != -1].copy() - - for cluster_id, cluster_data in df_valid.groupby("class"): - cluster_size = len(cluster_data) - - # Skip small clusters - if cluster_size < min_cluster_size: - continue - - # Compute simplicity scores and filter candidates in one pass - valid_candidates = [] - - for idx, row in cluster_data.iterrows(): - label = str(row[label_column]) - - # Apply quick filters first (before expensive simplicity calculation) - if max_word_length is not None: - if any(len(word) > max_word_length for word in label.split()): - continue - - # Compute simplicity metrics - complexity = _measure_label_simplicity( - label, word_frequencies=word_frequencies - ) - - # Apply complexity filters - if ( - max_complexity_chars is not None - and complexity["char_count"] > max_complexity_chars - ): - continue - if ( - max_complexity_words is not None - and complexity["word_count"] > max_complexity_words - ): - continue - if ( - min_simplicity_score is not None - and complexity["simplicity_score"] < min_simplicity_score - ): - continue - - # This candidate passes all filters - valid_candidates.append((idx, complexity["simplicity_score"])) - - # Skip cluster if no valid candidates - if not valid_candidates: - continue - - # Select the candidate with the highest simplicity score - best_idx, _ = max(valid_candidates, key=lambda x: x[1]) - selected_row = cluster_data.loc[best_idx] - - cluster_results.append( - { - "cluster_id": cluster_id, - "cluster_size": cluster_size, - "center_id": selected_row[id_column], - "center_label": selected_row[label_column], - } - ) - - # Sort by cluster size (largest first) - cluster_results.sort(key=lambda x: x["cluster_size"], reverse=True) - return cluster_results - - -def 
compute_dbcv_after_filtering( - df_clustered: pd.DataFrame, - umap_columns: list[str], - valid_cluster_ids: set[int], -) -> float: - """ - Compute DBCV score after filtering out perplex clusters. - - Args: - df_clustered: DataFrame with cluster assignments - umap_columns: List of UMAP column names - valid_cluster_ids: Set of cluster IDs to keep - - Returns: - DBCV score for filtered clusters - """ - - # Filter to only valid clusters (exclude noise and filtered-out clusters) - df_filtered = df_clustered[df_clustered["class"].isin(valid_cluster_ids)].copy() - - if len(df_filtered) < 2: - return 0.0 - - # Re-fit HDBSCAN on filtered data to get DBCV - # We need to estimate min_cluster_size - use a reasonable default - min_size = max( - 2, len(df_filtered) // 20 - ) # At least 2, but aim for ~20 points per cluster - clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size, gen_min_span_tree=True) - - if hasattr(clusterer, "relative_validity_"): - return float(clusterer.relative_validity_) - else: - return 0.0 - - def compute_hungarian_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float: """ Compute clustering accuracy using Hungarian algorithm to optimally match @@ -759,3 +507,111 @@ def embeddings_dict_to_dataframe( label_list.append(label) return pd.DataFrame({"id": id_list, "label": label_list, "embed": embeddings_list}) + + +def compute_kb_generality_scores( + embeddings: np.ndarray, + labels: list[str], + k_neighbors: int = 10, + metric: str = "cosine", + word_frequencies: Mapping[str, float] | None = None, + density_weight: float = 0.5, +) -> np.ndarray: + """ + Compute generality scores for entities based on KB statistics. + + Combines embedding-space density with label simplicity to identify generic vs specific terms. 
+ Generic terms tend to have: + - Many similar neighbors (high density) + - High average similarity to neighbors + - Shorter, simpler labels (fewer words, common words) + - Central position in semantic space + + Args: + embeddings: Array of shape (n_points, n_features) containing embeddings + labels: List of labels corresponding to embeddings + k_neighbors: Number of nearest neighbors to consider + metric: Distance metric ('cosine' or 'euclidean') + word_frequencies: Optional word frequency mapping for simplicity scoring + density_weight: Weight for embedding density vs label simplicity (0.0 = pure simplicity, 1.0 = pure density) + + Returns: + Array of generality scores (higher = more generic), shape (n_points,) + """ + from sklearn.neighbors import NearestNeighbors + + n_points = embeddings.shape[0] + k_neighbors = min(k_neighbors, n_points - 1) + + if k_neighbors < 1: + return np.ones(n_points) + + # Normalize embeddings for cosine distance + if metric == "cosine": + embeddings_norm = embeddings / ( + np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-8 + ) + else: + embeddings_norm = embeddings + + # Find k nearest neighbors for each point + nn = NearestNeighbors(n_neighbors=k_neighbors + 1, metric=metric) + nn.fit(embeddings_norm) + distances, indices = nn.kneighbors(embeddings_norm) + + # Compute embedding-space density scores + density_scores = np.zeros(n_points) + + for i in range(n_points): + # Get neighbors (excluding self) + neighbor_distances = distances[i, 1:] + + # Convert distances to similarities (for cosine: similarity = 1 - distance) + if metric == "cosine": + similarities = 1.0 - neighbor_distances + else: + # For euclidean, use inverse distance (with smoothing) + similarities = 1.0 / (1.0 + neighbor_distances) + + # Density = average similarity to neighbors + # Higher similarity means the term is in a dense region (more generic) + density_scores[i] = similarities.mean() + + density_scores = np.log(density_scores) + # Normalize density scores 
to [0, 1] range + if density_scores.max() > density_scores.min(): + density_scores_norm = (density_scores - density_scores.min()) / ( + density_scores.max() - density_scores.min() + ) + else: + density_scores_norm = np.ones_like(density_scores) + + # Compute label simplicity scores + if word_frequencies is None: + word_frequencies = {} + + simplicity_scores = np.zeros(n_points) + for i, label in enumerate(labels): + simplicity_metrics = _measure_label_simplicity( + str(label), word_frequencies=word_frequencies + ) + simplicity_scores[i] = simplicity_metrics["simplicity_score"] + + simplicity_scores = np.log(simplicity_scores) + + # Normalize simplicity scores to [0, 1] range + if simplicity_scores.max() > simplicity_scores.min(): + simplicity_scores_norm = (simplicity_scores - simplicity_scores.min()) / ( + simplicity_scores.max() - simplicity_scores.min() + ) + else: + simplicity_scores_norm = np.ones_like(simplicity_scores) + + # Combine density and simplicity scores + # Shorter, simpler terms should be preferred even if density is similar + generality_scores = ( + density_weight * density_scores_norm + + (1 - density_weight) * simplicity_scores_norm + ) + + return generality_scores diff --git a/pelinker/plotting.py b/pelinker/plotting.py index f66eb2a..f7a160c 100644 --- a/pelinker/plotting.py +++ b/pelinker/plotting.py @@ -99,11 +99,8 @@ def plot_metrics_with_error_bars( markersize=8, err_kws={"alpha": 0.3, "linewidth": 1.5}, ) - # axes[2].set_yscale("log") axes[1].set_xlabel("min_cluster_size", fontsize=12, fontweight="bold") - axes[1].set_ylabel( - "n_clusters (log scale)", fontsize=12, fontweight="bold", color=colors[2] - ) + axes[1].set_ylabel("n clusters", fontsize=12, fontweight="bold", color=colors[2]) axes[1].set_title( "Number of Clusters vs. 
min_cluster_size", fontsize=13, fontweight="bold" ) diff --git a/pelinker/reporting.py b/pelinker/reporting.py index 22dba66..0fbecab 100644 --- a/pelinker/reporting.py +++ b/pelinker/reporting.py @@ -1,4 +1,3 @@ -import logging from dataclasses import dataclass import pandas as pd @@ -18,71 +17,3 @@ class ClusteringReport: hungarian_accuracy: float | None = ( None # Hungarian matching accuracy (None if not computed) ) - - -def log_clustering_scores(results: list[dict], logger: logging.Logger) -> None: - """ - Log clustering scores for different target cluster counts. - - Args: - results: List of result dictionaries with target_count, actual_count, scores - logger: Logger instance for output - """ - logger.info("\n" + "=" * 80) - logger.info("CLUSTERING SCORES") - logger.info("=" * 80) - logger.info( - "%-15s %-15s %-20s %-20s %-15s", - "Target", - "Actual", - "Score (before)", - "Score (after)", - "Valid Clusters", - ) - logger.info("-" * 85) - - for r in results: - logger.info( - "%-15d %-15d %-20.4f %-20.4f %-15d", - r["target_count"], - r["actual_count"], - r["score_before_filtering"], - r["score_after_filtering"], - r["n_valid_clusters"], - ) - - logger.info("=" * 80) - - -def log_clustering_results(cluster_results: list[dict], logger: logging.Logger) -> None: - """ - Log clustering results in a formatted table. 
- - Args: - cluster_results: List of cluster result dictionaries - logger: Logger instance for output - """ - logger.info("\n" + "=" * 80) - logger.info("CLUSTER DETAILS") - logger.info("=" * 80) - logger.info("Total clusters: %d (excluding noise)", len(cluster_results)) - logger.info("\nCluster details:") - logger.info( - "%-10s %-30s %-50s %-10s", - "Cluster ID", - "Center ID", - "Center Label", - "Size", - ) - logger.info("-" * 100) - - for cr in cluster_results: - logger.info( - "%-10d %-30s %-50s %-10d", - cr["cluster_id"], - str(cr["center_id"])[:30], - str(cr["center_label"])[:50], - cr["cluster_size"], - ) - - logger.info("=" * 80) diff --git a/run/README.md b/run/README.md new file mode 100644 index 0000000..106cd3f --- /dev/null +++ b/run/README.md @@ -0,0 +1,117 @@ +# Run Scripts Documentation + +This directory contains scripts for preprocessing knowledge bases, embedding corpora, and analyzing embedding quality. + +## Directory Structure + +``` +run/ +├── README.md # This file +├── embed_kb_corpus.py # Embed knowledge base corpus +├── loop.embed.kb.corpus.sh # Batch embedding script +├── preprocessing/ # Property knowledge base generation +│ ├── extract_properties_go.py # Extract from GO-CAMs ontology +│ ├── extract_properties_ro.py # Extract from Relations Ontology +│ └── merge_properties.py # Merge properties from all sources +├── analysis/ # Embedding quality evaluation +│ ├── clustering_quality.py # Measure clustering quality of embeddings +│ └── select_diverse_entities.py # Select diverse entity subsets +└── obsolete/ # Deprecated scripts (not actively maintained) + ├── analysis/ + ├── experiments/ + ├── preprocessing/ + └── testing/ +``` + +## Preprocessing Scripts + +Scripts in the `preprocessing/` directory generate property knowledge base files from various ontology sources. + +### `extract_properties_go.py` + +Extracts property definitions from the Gene Ontology (GO) Causal Activity Models (GO-CAMs) ontology. 
+ +- **Input**: `data/raw/GO-CAMs.ttl.gz` (Turtle format ontology file) +- **Output**: + - `data/derived/properties.go.csv` - Extracted properties with entity IDs, labels, and descriptions + - `data/derived/properties.go.failed.csv` - Entities that failed to fetch from the OLS API +- **Process**: Queries the GO-CAMs ontology for object properties, then fetches detailed metadata from the EBI OLS API + +### `extract_properties_ro.py` + +Extracts property definitions from the Relations Ontology (RO). + +- **Input**: `data/raw/ro.owl` (OWL format ontology file) +- **Output**: `data/derived/properties.ro.csv` - Extracted properties with entity IDs, labels, and descriptions +- **Process**: Parses the RO OWL file and extracts object properties with their labels and descriptions + +### `merge_properties.py` + +Merges properties from multiple sources (RO, GO, and custom properties) into a unified knowledge base. + +- **Inputs**: + - `data/derived/properties.ro.csv` + - `data/raw/properties.csv` (custom properties) + - Latest versioned synthesis file (if exists) +- **Output**: `data/derived/properties.synthesis.{version}.csv` - Merged property knowledge base +- **Process**: + - Merges RO properties, existing PEL properties, and new custom properties + - Filters out obsolete or deprecated properties + - Assigns entity IDs to new properties (PEL.{number} format) + - Removes duplicates, prioritizing entries with descriptions + - Only creates a new version if entity IDs have changed + +## Embedding Scripts + +### `embed_kb_corpus.py` + +Embeds a knowledge base corpus using transformer models for downstream analysis and linking tasks. 
+ +- **Purpose**: Processes text corpora and extracts mentions of properties, generating embeddings for each mention +- **Inputs**: + - `--input-text-table-path`: TSV/CSV file with `pmid` and `text` columns (optionally gzipped) + - `--properties-txt-path`: Newline-separated list of properties/patterns to search for +- **Output**: Parquet file containing extracted mentions with their embeddings +- **Features**: + - Supports multiple model types (biobert, pubmedbert, scibert, etc.) + - Configurable layer selection for embeddings + - GPU acceleration support + - Streaming processing with chunking for large datasets + - Extracts mentions at multiple word grouping levels (W1, W2, W3) + +## Analysis Scripts + +Scripts in the `analysis/` directory evaluate embedding quality and select diverse entities. + +### `clustering_quality.py` + +Measures the quality of embeddings obtained from `embed_kb_corpus.py` by evaluating clustering performance. + +- **Purpose**: Evaluates how well embeddings cluster semantically similar properties together +- **Input**: Directory containing parquet files (pattern: `res__.parquet`) +- **Outputs**: + - `results.csv` - Summary table with metrics for each model/layer combination + - `model.perf.heatmap.png` - Heatmap of best scores across models + - `model.hungarian_accuracy.heatmap.png` - Heatmap of Hungarian matching accuracy (if available) + - `{model}_{layer}.png` - Metrics plots for each model/layer + - `umap_best.html` - Interactive UMAP visualization of the best performing model +- **Key Features**: + - Evaluates multiple model/layer combinations + - Optimizes cluster size using various metrics + - Supports multiple sampling runs for statistical robustness + - **Optional**: `--selected-labels-kb-path` parameter to evaluate quality over a specific subset of labels from a selected knowledge base CSV file +- **Metrics**: Best cluster size, number of properties, clustering score, Hungarian matching accuracy + +### `select_diverse_entities.py` 
+ +Selects semantically diverse entities from a knowledge base using clustering-based selection. + +- **Purpose**: Identifies a diverse subset of entities that represent the semantic space of the full knowledge base +- **Input**: CSV/TSV file with entity IDs and labels +- **Output**: CSV file with selected diverse entities +- **Process**: + - Embeds all labels using a transformer model + - Applies PCA for dimensionality reduction + - Uses K-means clustering to identify diverse groups + - Selects the most representative entity from each cluster (preferring generic/simple terms) +- **Use Case**: Useful for creating evaluation sets or reducing knowledge base size while maintaining semantic coverage diff --git a/run/analysis/clustering.quality.py b/run/analysis/clustering_quality.py similarity index 99% rename from run/analysis/clustering.quality.py rename to run/analysis/clustering_quality.py index 6e91e4b..5f085c9 100644 --- a/run/analysis/clustering.quality.py +++ b/run/analysis/clustering_quality.py @@ -27,10 +27,6 @@ from pelinker.transform import TransformConfig -# estimate_model is now estimate_model_clustering in pelinker.analysis -# parse_filename is now parse_model_filename in pelinker.ops - - @click.command() @click.option( "--input-dir", diff --git a/run/analysis/select_diverse_entities.py b/run/analysis/select_diverse_entities.py new file mode 100644 index 0000000..d57da7d --- /dev/null +++ b/run/analysis/select_diverse_entities.py @@ -0,0 +1,290 @@ +import logging +import pathlib + +import click +import pandas as pd +import torch +import spacy +import numpy as np +from sklearn.decomposition import PCA + +from pelinker.model import LinkerModel +from pelinker.ops import load_dataframe +from pelinker.util import load_models, embed_texts +from sklearn.cluster import KMeans + +from pelinker.analysis import ( + embeddings_dict_to_dataframe, + get_word_frequencies_from_library, + compute_kb_generality_scores, +) + +logger = logging.getLogger(__name__) + + +def 
select_diverse_entities( + embeddings_dict: dict[str, tuple[str, torch.Tensor]], + id_column: str, + label_column: str, + n_select: int = 10, + *, + pca_components: int = 20, + random_state=12, + metric: str = "cosine", +) -> pd.DataFrame: + """ + Select semantically diverse entities using K-means clustering. + + Clusters the PCA-reduced embeddings into n_select groups and keeps, from each cluster, + the entity with the highest KB generality score (embedding density + label simplicity). + + Args: + embeddings_dict: Dictionary mapping id -> (label, embedding) + id_column: Name of column containing item IDs + label_column: Name of column containing item labels + n_select: Number of entities to select, i.e. the number of K-means clusters (default: 10) + pca_components: Number of PCA components (capped at the number of entities minus one) + metric: Distance metric ('cosine' or 'euclidean') for the nearest-neighbour + density part of the generality scores + random_state: Random seed passed to K-means + + Returns: + DataFrame with one selected entity per cluster (columns: id_column, label_column, score, icluster) + """ + logger.info( + "Selecting %d diverse entities from %d candidates", + n_select, + len(embeddings_dict), + ) + + # Convert to dataframe + df = embeddings_dict_to_dataframe(embeddings_dict) + + # Rename columns to match the actual column names + df = df.rename(columns={"id": id_column, "label": label_column}) + + # Get word frequencies for filtering and scoring + word_frequencies = get_word_frequencies_from_library(language="en", wordlist="best") + + # Extract embeddings for KB-based analysis + embeddings = np.stack(df["embed"].values) + + # Compute KB-based generality scores if requested + logger.info("Computing KB-based generality scores (density + label simplicity)...") + # Use raw embeddings for generality computation (before PCA) + labels_list = df[label_column].astype(str).tolist() + kb_generality_scores = compute_kb_generality_scores( + embeddings, + labels_list, + k_neighbors=min(10,
len(df) - 1), + metric=metric, + word_frequencies=word_frequencies, + density_weight=0.4, + ) + + pca = PCA(n_components=min(pca_components, len(df) - 1)) + embeddings = pca.fit_transform(embeddings) + logger.info( + "PCA explained variance ratio: %.3f", pca.explained_variance_ratio_.sum() + ) + + preference_scores = kb_generality_scores + + kmeans = KMeans(n_clusters=n_select, random_state=random_state, n_init="auto").fit( + embeddings + ) + + labels = kmeans.labels_ + + dfw = df[[id_column, label_column]].copy() + dfw["score"] = preference_scores + dfw["icluster"] = labels + df_selected = dfw.groupby("icluster").apply(lambda x: x.loc[x["score"].idxmax()]) + return df_selected + + +@click.command() +@click.option( + "--input-table-path", + type=click.Path(path_type=pathlib.Path), + required=True, + help="Path to the dataframe to load (CSV/TSV, optionally gzipped).", +) +@click.option( + "--label-column", + type=click.STRING, + default="label", + show_default=True, + help="Column containing the labels/phrases to embed.", +) +@click.option( + "--id-column", + type=click.STRING, + default="entity_id", + show_default=True, + help="Column containing the IDs to use as dictionary keys.", +) +@click.option( + "--n-select", + type=click.INT, + default=10, + help="Number of diverse entities to select (10-20 recommended).", +) +@click.option( + "--model-type", + type=click.STRING, + default="pubmedbert", + help="Backbone model identifier passed to pelinker.util.load_models.", +) +@click.option( + "--layers-spec", + type=click.STRING, + default="4", + help="Layer spec string (digits for token layers).", +) +@click.option( + "--use-gpu", + is_flag=True, + help="Move the encoder model to CUDA if available.", +) +@click.option( + "--pca-components", + type=click.INT, + default=20, + help="Number of PCA components for dimensionality reduction.", +) +@click.option( + "--metric", + type=click.Choice(["cosine", "euclidean"]), + default="cosine", + help="Distance metric for FPS.", +) 
+@click.option( + "--random-state", + type=click.INT, + default=13, + help="Random seed for selecting first point (None = deterministic, uses first point).", +) +@click.option( + "--output-path", + type=click.Path(path_type=pathlib.Path), + required=True, + help="Path for saving selected entities CSV file.", +) +def run( + input_table_path, + label_column, + id_column, + n_select, + model_type, + layers_spec, + use_gpu, + pca_components, + metric, + random_state, + output_path, +): + """ + Select semantically diverse entities using farthest point sampling. + + This approach is deterministic and stable, unlike clustering-based methods. + """ + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + ) + + layers = LinkerModel.str2layers(layers_spec) + + logger.info("Loading dataframe from %s", input_table_path) + df = load_dataframe(input_table_path) + + if label_column not in df.columns: + raise click.BadParameter( + f"Column '{label_column}' not found in dataframe columns {list(df.columns)}", + param_hint="--label-column", + ) + + if id_column not in df.columns: + raise click.BadParameter( + f"Column '{id_column}' not found in dataframe columns {list(df.columns)}", + param_hint="--id-column", + ) + + tokenizer, model = load_models(model_type) + + if use_gpu: + if torch.cuda.is_available(): + logger.info("Moving model to CUDA") + model = model.to("cuda") + else: + logger.warning("CUDA not available, falling back to CPU") + + # Load spacy model for texts_to_vrep + logger.info("Loading spaCy model") + nlp = spacy.load("en_core_web_trf") + + # Filter rows where both id and label are not null + df_filtered = df[[id_column, label_column]].dropna() + if df_filtered.empty: + logger.warning( + "No rows with both '%s' and '%s' columns non-null", id_column, label_column + ) + return + + # Filter out empty labels + valid_mask = df_filtered[label_column].apply( + lambda x: pd.notna(x) and str(x).strip() != "" + ) + df_valid = 
df_filtered[valid_mask] + + if df_valid.empty: + logger.warning("No rows with non-empty labels after filtering") + return + + ids = df_valid[id_column].tolist() + labels = df_valid[label_column].tolist() + + logger.info("Embedding %d labels...", len(labels)) + # Convert embeddings to list format + text_embeddings = embed_texts( + labels, + tokenizer=tokenizer, + model=model, + layers=layers, + nlp=nlp, + ) + + # Create dictionary mapping id -> (label, embedding) + result = {} + for id_val, label, emb in zip(ids, labels, text_embeddings): + result[str(id_val)] = (str(label), emb) + + logger.info( + "Embedded %d items from columns '%s' (labels) and '%s' (ids)", + len(result), + label_column, + id_column, + ) + + # Select diverse entities + selected_df = select_diverse_entities( + result, + id_column=id_column, + label_column=label_column, + n_select=n_select, + pca_components=pca_components, + metric=metric, + random_state=random_state, + ) + + # Save results + output_path = output_path.expanduser() + output_path.parent.mkdir(parents=True, exist_ok=True) + logger.info("Saving selected entities to %s", output_path) + selected_df.to_csv(output_path, index=False) + logger.info("Saved %d diverse entities", len(selected_df)) + + +if __name__ == "__main__": + run() diff --git a/run/experiments/__init__.py b/run/obsolete/__init__.py similarity index 100% rename from run/experiments/__init__.py rename to run/obsolete/__init__.py diff --git a/run/testing/__init__.py b/run/obsolete/analysis/__init__.py similarity index 100% rename from run/testing/__init__.py rename to run/obsolete/analysis/__init__.py diff --git a/run/analysis/plot_discrim_dist.py b/run/obsolete/analysis/plot_discrim_dist.py similarity index 100% rename from run/analysis/plot_discrim_dist.py rename to run/obsolete/analysis/plot_discrim_dist.py diff --git a/run/analysis/plot_dispersion_hist.py b/run/obsolete/analysis/plot_dispersion_hist.py similarity index 100% rename from run/analysis/plot_dispersion_hist.py 
rename to run/obsolete/analysis/plot_dispersion_hist.py diff --git a/run/analysis/plot_interterm_dist.py b/run/obsolete/analysis/plot_interterm_dist.py similarity index 100% rename from run/analysis/plot_interterm_dist.py rename to run/obsolete/analysis/plot_interterm_dist.py diff --git a/run/obsolete/csvcut.py b/run/obsolete/csvcut.py new file mode 100644 index 0000000..fe6f8d0 --- /dev/null +++ b/run/obsolete/csvcut.py @@ -0,0 +1,8 @@ +import csv +import sys + +reader = csv.reader(sys.stdin) +writer = csv.writer(sys.stdout) + +for row in reader: + writer.writerow(row[1:]) diff --git a/pelinker/writer.py b/run/obsolete/experiments/__init__.py similarity index 100% rename from pelinker/writer.py rename to run/obsolete/experiments/__init__.py diff --git a/run/experiments/pattern_tensor_align.py b/run/obsolete/experiments/pattern_tensor_align.py similarity index 100% rename from run/experiments/pattern_tensor_align.py rename to run/obsolete/experiments/pattern_tensor_align.py diff --git a/run/obsolete/preprocessing/__init__.py b/run/obsolete/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/run/preprocessing/disambiguate.py b/run/obsolete/preprocessing/disambiguate.py similarity index 100% rename from run/preprocessing/disambiguate.py rename to run/obsolete/preprocessing/disambiguate.py diff --git a/run/save_model.py b/run/obsolete/save_model.py similarity index 100% rename from run/save_model.py rename to run/obsolete/save_model.py diff --git a/run/serve.py b/run/obsolete/serve.py similarity index 100% rename from run/serve.py rename to run/obsolete/serve.py diff --git a/run/test.pat.align.sh b/run/obsolete/test.pat.align.sh similarity index 100% rename from run/test.pat.align.sh rename to run/obsolete/test.pat.align.sh diff --git a/run/obsolete/testing/__init__.py b/run/obsolete/testing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/run/testing/find_occurences.py b/run/obsolete/testing/find_occurences.py 
similarity index 100% rename from run/testing/find_occurences.py rename to run/obsolete/testing/find_occurences.py diff --git a/run/testing/run_pel_description.py b/run/obsolete/testing/run_pel_description.py similarity index 100% rename from run/testing/run_pel_description.py rename to run/obsolete/testing/run_pel_description.py diff --git a/run/testing/run_pel_test.py b/run/obsolete/testing/run_pel_test.py similarity index 100% rename from run/testing/run_pel_test.py rename to run/obsolete/testing/run_pel_test.py diff --git a/run/testing/test_linking.py b/run/obsolete/testing/test_linking.py similarity index 100% rename from run/testing/test_linking.py rename to run/obsolete/testing/test_linking.py diff --git a/run/testing/test_server.py b/run/obsolete/testing/test_server.py similarity index 100% rename from run/testing/test_server.py rename to run/obsolete/testing/test_server.py diff --git a/run/render_divergent_properties.py b/run/render_divergent_properties.py deleted file mode 100644 index fdc484a..0000000 --- a/run/render_divergent_properties.py +++ /dev/null @@ -1,473 +0,0 @@ -import json -import logging -import pathlib - -import click -import pandas as pd -import torch -import spacy - -from pelinker.model import LinkerModel -from pelinker.ops import load_dataframe -from pelinker.util import load_models, embed_texts -from pelinker.analysis import ( - cluster_with_target_count, - find_cluster_centers, - embeddings_dict_to_dataframe, - get_word_frequencies_from_library, - compute_dbcv_after_filtering, -) -from pelinker.transform import TransformConfig, get_umap_columns, transform_embeddings -from pelinker.reporting import log_clustering_scores, log_clustering_results - -logger = logging.getLogger(__name__) - - -def _perform_clustering_analysis( - embeddings_dict: dict[str, tuple[str, torch.Tensor]], - id_column: str, - label_column: str, - transform_config: TransformConfig, - *, - target_cluster_counts: list[int] | None = None, - min_cluster_size: int = 5, - 
max_chars: int | None = 30, - max_words: int | None = 4, - min_simplicity_score: float | None = 5e-8, - max_word_length: int | None = 22, - results_dir: pathlib.Path | None = None, - sem_div_kb_path: pathlib.Path | None = None, -) -> dict: - """ - Perform clustering analysis on embeddings and output results for multiple cluster counts. - - Args: - embeddings_dict: Dictionary mapping id -> (label, embedding) - id_column: Name of column containing item IDs - label_column: Name of column containing item labels - transform_config: TransformConfig instance specifying transformation parameters - target_cluster_counts: List of target cluster counts to evaluate (default: [5, 10, 15, 20]) - min_cluster_size: Minimum cluster size for filtering - max_chars: Maximum characters for simplest example - max_words: Maximum words for simplest example - min_simplicity_score: Minimum simplicity score threshold - max_word_length: Maximum length for any word in the label (None = no limit) - results_dir: Output directory for saving JSON report (None = don't save) - sem_div_kb_path: Path for saving reduced KB dataframe (None = don't save) - - Returns: - Dictionary with report data - """ - if target_cluster_counts is None: - target_cluster_counts = [5, 10, 15, 20, 30] - - logger.info("Performing clustering analysis...") - - # Convert embeddings to dataframe format - df_emb = embeddings_dict_to_dataframe(embeddings_dict) - - # Apply PCA -> UMAP reduction pipeline - logger.info( - "Applying PCA(%d) -> UMAP(%d) reduction pipeline...", - transform_config.pca_components, - transform_config.umap_components, - ) - df_umap = transform_embeddings( - df_emb, config=transform_config, embed_column="embed" - ) - umap_columns = get_umap_columns(transform_config) - - # Get word frequencies from external library for better simplicity scoring - logger.info("Loading word frequencies from wordfreq library...") - word_frequencies = get_word_frequencies_from_library(language="en", wordlist="best") - if 
word_frequencies is None: - logger.warning( - "wordfreq library not available. Install with: pip install wordfreq\n" - "Falling back to word length-based simplicity scoring." - ) - else: - logger.info("Using wordfreq library for frequency lookups") - - logger.info( - "Filtering clusters: min_size=%d, max_chars=%s, max_words=%s, min_simplicity=%.2e", - min_cluster_size, - max_chars if max_chars else "unlimited", - max_words if max_words else "unlimited", - min_simplicity_score if min_simplicity_score else 0.0, - ) - - # Compute clustering scores for each target count - logger.info( - "Computing clustering scores for target counts: %s", target_cluster_counts - ) - results = [] - - for target_count in target_cluster_counts: - logger.info("Clustering for target count: %d...", target_count) - df_clustered, actual_count, score = cluster_with_target_count( - df_umap, umap_columns, target_count - ) - - # Find valid clusters after filtering - cluster_results = find_cluster_centers( - df_clustered, - min_cluster_size=min_cluster_size, - max_complexity_chars=max_chars, - max_complexity_words=max_words, - min_simplicity_score=min_simplicity_score, - max_word_length=max_word_length, - word_frequencies=word_frequencies, - ) - valid_cluster_ids = {cr["cluster_id"] for cr in cluster_results} - - # Compute DBCV score after filtering - filtered_score = compute_dbcv_after_filtering( - df_clustered, umap_columns, valid_cluster_ids - ) - - results.append( - { - "target_count": target_count, - "actual_count": actual_count, - "score_before_filtering": score, - "score_after_filtering": filtered_score, - "n_valid_clusters": len(valid_cluster_ids), - "df": df_clustered, - "cluster_results": cluster_results, - } - ) - logger.info( - " Target: %d, Actual: %d, Before filtering: %.4f, After filtering: %.4f, Valid clusters: %d", - target_count, - actual_count, - score, - filtered_score, - len(valid_cluster_ids), - ) - - # Output summary table - log_clustering_scores(results, logger) - - # Output 
detailed results for the best scoring clustering (or 15 clusters if available) - best_result = max(results, key=lambda x: x["score_after_filtering"]) - default_result = next( - (r for r in results if r["target_count"] == max(target_cluster_counts)), - best_result, - ) - - logger.info( - "\nShowing cluster details for %d clusters (score after filtering: %.4f)...", - default_result["target_count"], - default_result["score_after_filtering"], - ) - - logger.info("\n--- Cluster representatives (simplest examples) ---") - log_clustering_results(default_result["cluster_results"], logger) - - # Build report - report = { - "clustering_results": [ - { - "target_count": r["target_count"], - "actual_count": r["actual_count"], - "score_before_filtering": r["score_before_filtering"], - "score_after_filtering": r["score_after_filtering"], - "n_valid_clusters": r["n_valid_clusters"], - } - for r in results - ], - "best_result": { - "target_count": default_result["target_count"], - "actual_count": default_result["actual_count"], - "score_before_filtering": default_result["score_before_filtering"], - "score_after_filtering": default_result["score_after_filtering"], - "n_valid_clusters": default_result["n_valid_clusters"], - "clusters": default_result["cluster_results"], - }, - "filtering_parameters": { - "min_cluster_size": min_cluster_size, - "max_chars": max_chars, - "max_words": max_words, - "min_simplicity_score": min_simplicity_score, - }, - } - - # Build dataframe with selected labels and IDs - selected_data = [] - for cluster_result in default_result["cluster_results"]: - selected_data.append( - { - id_column: cluster_result["center_id"], - label_column: cluster_result["center_label"], - "cluster_id": cluster_result["cluster_id"], - "cluster_size": cluster_result["cluster_size"], - } - ) - - df_selected = pd.DataFrame(selected_data) - - # Save JSON report - if results_dir: - results_dir = results_dir.expanduser() - results_dir.mkdir(parents=True, exist_ok=True) - json_path = 
results_dir / "semantic_divergent_clustering_report.json" - logger.info("Saving JSON report to %s", json_path) - with open(json_path, "w") as f: - json.dump(report, f, indent=2, default=str) - else: - logger.info("No results directory specified, skipping JSON report output") - - # Save reduced KB dataframe - if sem_div_kb_path: - sem_div_kb_path = sem_div_kb_path.expanduser() - sem_div_kb_path.parent.mkdir(parents=True, exist_ok=True) - logger.info("Saving reduced KB to %s", sem_div_kb_path) - - # Detect format from extension - if sem_div_kb_path.suffix == ".csv": - df_selected.to_csv(sem_div_kb_path, index=False) - else: - logger.warning( - "Unknown file extension, defaulting to CSV format. " - "Supported: .csv, .tsv, .parquet" - ) - csv_path = sem_div_kb_path.with_suffix(".csv") - df_selected.to_csv(csv_path, index=False) - logger.info("Saved to %s instead", csv_path) - else: - logger.info("No reduced KB path specified, skipping reduced KB output") - - return { - "report": report, - "selected_dataframe": df_selected, - } - - -@click.command() -@click.option( - "--input-table-path", - type=click.Path(path_type=pathlib.Path), - required=True, - help="Path to the dataframe to load (CSV/TSV, optionally gzipped).", -) -@click.option( - "--label-column", - type=click.STRING, - required=True, - help="Column containing the labels/phrases to embed.", -) -@click.option( - "--id-column", - type=click.STRING, - required=True, - help="Column containing the IDs to use as dictionary keys.", -) -@click.option( - "--model-type", - type=click.STRING, - default="biobert", - show_default=True, - help="Backbone model identifier passed to pelinker.util.load_models.", -) -@click.option( - "--layers-spec", - type=click.STRING, - default="1", - show_default=True, - help="Layer spec string (digits for token layers).", -) -@click.option( - "--use-gpu", - is_flag=True, - default=False, - help="Move the encoder model to CUDA if available.", -) -@click.option( - "--min-cluster-size", - 
type=click.INT, - default=5, - show_default=True, - help="Minimum number of members required in a cluster.", -) -@click.option( - "--max-chars", - type=click.INT, - default=30, - show_default=True, - help="Maximum character count for simplest example in cluster.", -) -@click.option( - "--max-words", - type=click.INT, - default=2, - show_default=True, - help="Maximum word count for simplest example in cluster.", -) -@click.option( - "--min-simplicity-score", - type=click.FLOAT, - default=5e-8, - show_default=True, - help="Minimum simplicity score (based on word frequency harmonic mean).", -) -@click.option( - "--max-word-length", - type=click.INT, - default=22, - show_default=True, - help="Maximum length for any word in the label (labels with words longer than this are discarded).", -) -@click.option( - "--results-dir", - type=click.Path(path_type=pathlib.Path), - default=None, - help="Output directory for saving JSON clustering report.", -) -@click.option( - "--sem-div-kb-path", - type=click.Path(path_type=pathlib.Path), - default=None, - help="Path for saving semantically divergent KB dataframe (selected labels). 
Supports .csv", -) -@click.option( - "--umap-dim", - type=click.INT, - default=4, - show_default=True, - help="UMAP dimensionality for clustering (range: 3-5).", -) -@click.option( - "--pca-components", - type=click.INT, - default=50, - show_default=True, - help="Number of PCA components for dimensionality reduction.", -) -def run( - input_table_path, - label_column, - id_column, - model_type, - layers_spec, - use_gpu, - min_cluster_size, - max_chars, - max_words, - min_simplicity_score, - max_word_length, - results_dir, - sem_div_kb_path, - umap_dim, - pca_components, -): - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - ) - - layers = LinkerModel.str2layers(layers_spec) - - logger.info("Loading dataframe from %s", input_table_path) - df = load_dataframe(input_table_path) - - if label_column not in df.columns: - raise click.BadParameter( - f"Column '{label_column}' not found in dataframe columns {list(df.columns)}", - param_hint="--label-column", - ) - - if id_column not in df.columns: - raise click.BadParameter( - f"Column '{id_column}' not found in dataframe columns {list(df.columns)}", - param_hint="--id-column", - ) - - tokenizer, model = load_models(model_type) - - if use_gpu: - if torch.cuda.is_available(): - logger.info("Moving model to CUDA") - model = model.to("cuda") - else: - logger.warning("CUDA not available, falling back to CPU") - - # Load spacy model for texts_to_vrep - logger.info("Loading spaCy model") - nlp = spacy.load("en_core_web_trf") - - # Filter rows where both id and label are not null - df_filtered = df[[id_column, label_column]].dropna() - if df_filtered.empty: - logger.warning( - "No rows with both '%s' and '%s' columns non-null", id_column, label_column - ) - return {} - - # Filter out empty labels and keep track of valid indices - valid_mask = df_filtered[label_column].apply( - lambda x: pd.notna(x) and str(x).strip() != "" - ) - df_valid = df_filtered[valid_mask] - - if 
df_valid.empty: - logger.warning("No rows with non-empty labels after filtering") - return {} - - ids = df_valid[id_column].tolist() - labels = df_valid[label_column].tolist() - - # Convert embeddings to list format - text_embeddings = embed_texts( - labels, - tokenizer=tokenizer, - model=model, - layers=layers, - nlp=nlp, - ) - - # Create dictionary mapping id -> (label, embedding) - result = {} - for id_val, label, emb in zip(ids, labels, text_embeddings): - result[str(id_val)] = (str(label), emb) - - logger.info( - "Embedded %d items from columns '%s' (labels) and '%s' (ids)", - len(result), - label_column, - id_column, - ) - for idx, (id_val, (label, embedding)) in enumerate(result.items()): - logger.info( - "Sample #%d id='%s' label='%s' -> dim=%d", - idx + 1, - str(id_val)[:30], - str(label)[:60], - len(embedding), - ) - if idx >= 2: - break - - # Create transform config from CLI options - transform_config = TransformConfig( - pca_components=pca_components, - umap_components=umap_dim, - ) - - # Perform clustering - _ = _perform_clustering_analysis( - result, - label_column=label_column, - id_column=id_column, - transform_config=transform_config, - min_cluster_size=min_cluster_size, - max_chars=max_chars, - max_words=max_words, - min_simplicity_score=min_simplicity_score, - max_word_length=max_word_length, - results_dir=results_dir, - sem_div_kb_path=sem_div_kb_path, - ) - - -if __name__ == "__main__": - run()