diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..287a2f0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + diff --git a/GMSDB.py b/GMSDB.py index a75f6ba..34856f5 100644 --- a/GMSDB.py +++ b/GMSDB.py @@ -1,17 +1,19 @@ -## @package gmsdb_v2.1 +# @package gmsdb_v2.2 # Made under GNU GPL v3.0 # by Oleg I.Berngardt, 2023 -# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as +# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public +# License as # published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. # You should have received a copy of the GNU General Public License along with this program. # If not, see https://www.gnu.org/licenses/. - + import sys import numpy as np import matplotlib.pyplot as pp import pickle +import scipy.stats as ss from sklearn.preprocessing import StandardScaler from sklearn.metrics import pairwise_distances @@ -21,102 +23,101 @@ # import mahalanobis as mh_dist from mlxtend.plotting import plot_decision_regions - -import scipy.stats as ss from numpy.linalg import inv from statsmodels.stats.multitest import multipletests -def getMahalanobisMatrix(X,gm,alpha=0.05,show=False,N=-1): - '''! Calculates $p_{ij}$ matrix of Mahalanobis intercluster distances using given significance level alpha - @param X - source dataset - @param gm - trained GaussianMixture class - @param alpha significance level (recommended default value 0.05) - @param show Debug flag, shows distributions of p_{ij} and p_{ii} into img/ folder. Default False - @param N - limits processing large datasets, use default -1 - ''' - percent=int(alpha*100) -# N=-1 - Y=gm.predict(X) - cov=gm.covariances_ -# print(cov.shape) - DD=np.zeros((gm.n_components,gm.n_components)) - DN=np.zeros((gm.n_components,gm.n_components)) - pd_ii=np.zeros((1,1)) - pd_jj=np.zeros((1,1)) - pd_ij=np.zeros((1,1)) - pd_ji=np.zeros((1,1)) - for i in range(gm.n_components): - for j in range(gm.n_components): -# print('process ',i,j,end=' ') - - - if X[Y==i].shape[0]>2 and X[Y==j].shape[0]>2: - pd_ij = pairwise_distances(X[Y==i][:N],X[Y==j][:N],metric='mahalanobis',VI=inv(cov[j])) - pd_ij=pd_ij.reshape(pd_ij.shape[0]*pd_ij.shape[1]) - pd_ji = pairwise_distances(X[Y==j][:N],X[Y==i][:N],metric='mahalanobis',VI=inv(cov[i])) - pd_ji=pd_ji.reshape(pd_ji.shape[0]*pd_ji.shape[1]) - if X[Y==i].shape[0]>2 and X[Y==i].shape[0]>2: - pd_ii = pairwise_distances(X[Y==i][:N],X[Y==i][:N],metric='mahalanobis',VI=inv(cov[i])) - pd_ii=pd_ii.reshape(pd_ii.shape[0]*pd_ii.shape[1]) - if X[Y==j].shape[0]>2 and X[Y==j].shape[0]>2: - pd_jj = pairwise_distances(X[Y==j][:N],X[Y==j][:N],metric='mahalanobis',VI=inv(cov[j])) - pd_jj=pd_jj.reshape(pd_jj.shape[0]*pd_jj.shape[1]) - - - cond=np.where(np.isnan(pd_ii),False,True) - pd_ii=pd_ii[cond] - cond=np.where(np.isnan(pd_jj),False,True) - pd_jj=pd_jj[cond] - cond=np.where(np.isnan(pd_ij),False,True) - pd_ij=pd_ij[cond] - cond=np.where(np.isnan(pd_ji),False,True) - pd_ji=pd_ji[cond] - - d_ii=np.median(pd_ii) - d_jj=np.median(pd_jj) - - if pd_ii.shape[0]*pd_ij.shape[0]*pd_ji.shape[0]*pd_jj.shape[0]>0: - d_ij=max(np.percentile(pd_ij,percent),np.percentile(pd_ji,percent)) - elif pd_ij.shape[0]*pd_ji.shape[0]==0: - if pd_ij.shape[0]>pd_ji.shape[0]: - d_ij=np.percentile(pd_ij,percent) - elif pd_ji.shape[0]>pd_ij.shape[0]: -
d_ij=np.percentile(pd_ji,percent) - else: - d_ij=-1 - - if d_ij>=0: - DD[i,j]+=d_ij - DN[i,j]+=1 - DD[i,i]+=d_ii - DN[i,i]+=1 - DD[j,j]+=d_jj - DN[j,j]+=1 - - if show: - pp.figure() - fig,axs=pp.subplots(2,1) - axs[0].hist(pd_ii[np.where(np.isnan(pd_ii),False,True)],bins=100,histtype='step',density=True,label='ii') - axs[0].hist(pd_ij[np.where(np.isnan(pd_ij),False,True)],bins=100,histtype='step',density=True,label='ij') - axs[0].hist(pd_ji[np.where(np.isnan(pd_ji),False,True)],bins=100,histtype='step',density=True,label='ji') - axs[0].hist(pd_jj[np.where(np.isnan(pd_jj),False,True)],bins=100,histtype='step',density=True,label='jj') - axs[0].legend() - cond=np.where((Y==i) | (Y==j)) - axs[1].scatter(X[cond,0],X[cond,1],c=Y[cond]) - pp.savefig('img/'+str(i)+'-'+str(j)+'.jpg') - pp.close() - DD/=DN - cond=np.where((np.isnan(DD) | np.isinf(DD)),True,False) - notcond=np.where((np.isnan(DD) | np.isinf(DD)),False,True) - max1=DD[notcond].max() - DD[cond]=max1 - return DD - - - -def ConfiDistance(MatrixClustersAll,gm=False, eps=1.,show=False,\ - alpha=0.1,bh_mh_correct=False,dim=2): - '''! Joins clusters into superclusters using DBSCAN and calculates MC quality index + +def getMahalanobisMatrix(X, gm, alpha=0.05, show=False, N=-1): + """ Calculates $p_{ij}$ matrix of Mahalanobis intercluster distances using given significance level alpha + @param X - source dataset + @param gm - trained GaussianMixture class + @param alpha significance level (recommended default value 0.05) + @param show Debug flag, shows distributions of p_{ij} and p_{ii} into img/ folder. Default False + @param N - limits processing large datasets, use default -1 + """ + percent = int(alpha * 100) + # N=-1 + Y = gm.predict(X) + cov = gm.covariances_ + # print(cov.shape) + DD = np.zeros((gm.n_components, gm.n_components)) + DN = np.zeros((gm.n_components, gm.n_components)) + pd_ii = np.zeros((1, 1)) + pd_jj = np.zeros((1, 1)) + pd_ij = np.zeros((1, 1)) + pd_ji = np.zeros((1, 1)) + for i in range(gm.n_components): + for j in range(gm.n_components): + # print('process ',i,j,end=' ') + + if X[Y == i].shape[0] > 2 and X[Y == j].shape[0] > 2: + pd_ij = pairwise_distances(X[Y == i][:N], X[Y == j][:N], metric='mahalanobis', VI=inv(cov[j])) + pd_ij = pd_ij.reshape(pd_ij.shape[0] * pd_ij.shape[1]) + pd_ji = pairwise_distances(X[Y == j][:N], X[Y == i][:N], metric='mahalanobis', VI=inv(cov[i])) + pd_ji = pd_ji.reshape(pd_ji.shape[0] * pd_ji.shape[1]) + if X[Y == i].shape[0] > 2 and X[Y == i].shape[0] > 2: + pd_ii = pairwise_distances(X[Y == i][:N], X[Y == i][:N], metric='mahalanobis', VI=inv(cov[i])) + pd_ii = pd_ii.reshape(pd_ii.shape[0] * pd_ii.shape[1]) + if X[Y == j].shape[0] > 2 and X[Y == j].shape[0] > 2: + pd_jj = pairwise_distances(X[Y == j][:N], X[Y == j][:N], metric='mahalanobis', VI=inv(cov[j])) + pd_jj = pd_jj.reshape(pd_jj.shape[0] * pd_jj.shape[1]) + + cond = np.where(np.isnan(pd_ii), False, True) + pd_ii = pd_ii[cond] + cond = np.where(np.isnan(pd_jj), False, True) + pd_jj = pd_jj[cond] + cond = np.where(np.isnan(pd_ij), False, True) + pd_ij = pd_ij[cond] + cond = np.where(np.isnan(pd_ji), False, True) + pd_ji = pd_ji[cond] + + d_ii = np.median(pd_ii) + d_jj = np.median(pd_jj) + + if pd_ii.shape[0] * pd_ij.shape[0] * pd_ji.shape[0] * pd_jj.shape[0] > 0: + d_ij = max(np.percentile(pd_ij, percent), np.percentile(pd_ji, percent)) + elif pd_ij.shape[0] * pd_ji.shape[0] == 0: + if pd_ij.shape[0] > pd_ji.shape[0]: + d_ij = np.percentile(pd_ij, percent) + elif pd_ji.shape[0] > pd_ij.shape[0]: + d_ij = np.percentile(pd_ji, percent) 
+ else: + d_ij = -1 + + if d_ij >= 0: + DD[i, j] += d_ij + DN[i, j] += 1 + DD[i, i] += d_ii + DN[i, i] += 1 + DD[j, j] += d_jj + DN[j, j] += 1 + + if show: + pp.figure() + fig, axs = pp.subplots(2, 1) + axs[0].hist(pd_ii[np.where(np.isnan(pd_ii), False, True)], bins=100, histtype='step', density=True, + label='ii') + axs[0].hist(pd_ij[np.where(np.isnan(pd_ij), False, True)], bins=100, histtype='step', density=True, + label='ij') + axs[0].hist(pd_ji[np.where(np.isnan(pd_ji), False, True)], bins=100, histtype='step', density=True, + label='ji') + axs[0].hist(pd_jj[np.where(np.isnan(pd_jj), False, True)], bins=100, histtype='step', density=True, + label='jj') + axs[0].legend() + cond = np.where((Y == i) | (Y == j)) + axs[1].scatter(X[cond, 0], X[cond, 1], c=Y[cond]) + pp.savefig('img/' + str(i) + '-' + str(j) + '.jpg') + pp.close() + DD /= DN + cond = np.where((np.isnan(DD) | np.isinf(DD)), True, False) + notcond = np.where((np.isnan(DD) | np.isinf(DD)), False, True) + max1 = DD[notcond].max() + DD[cond] = max1 + return DD + + +def ConfiDistance(MatrixClustersAll, gm=False, eps=1., show=False, alpha=0.1, bh_mh_correct=False, dim=2): + """ Joins clusters into super clusters using DBSCAN and calculates MC quality index @param MatrixClustersAll - source Mahalanobis distance matrix of clusters @param gm - trained GaussianMixture class (not used) @param eps - hyperparameter for DBSCAN @@ -131,74 +132,74 @@ def ConfiDistance(MatrixClustersAll,gm=False, eps=1.,show=False,\ @param bh_mh_correct - correct distances using Benjamini-Hochberg method, default False @param dim - dimension of source data - ''' - - BORDER=np.sqrt(ss.chi2(df=dim).ppf(1.-alpha)*2) - MC=MatrixClustersAll.copy() - for i in range(MC.shape[0]): - MC[i,i]=0. - -# ======== Correct by Benjamini-Hochberg - if bh_mh_correct: - pvals=1-ss.chi2(df=dim).cdf(MC.reshape(MC.shape[0]*MC.shape[1])**2/2) - rej,p_corr,a1,a2=multipletests(pvals,alpha=alpha,method='fdr_bh') - eeps=1e-10 - MC=np.sqrt(ss.chi2(df=dim).ppf(np.abs(1.-p_corr-eeps))*2).reshape(MC.shape[0],MC.shape[1]) + """ + + BORDER = np.sqrt(ss.chi2(df=dim).ppf(1. - alpha) * 2) + MC = MatrixClustersAll.copy() for i in range(MC.shape[0]): - MC[i,i]=0. -# ====================================== - - idx,labels_superclusters=dbscan(MC, metric='precomputed', eps=eps,min_samples=1) - - total_superclusters=len(set(list(labels_superclusters))) - if(total_superclusters)<2: - return -1e100,labels_superclusters - -# mean distance to closest cluster + 1/2 cluster size - ISCmin=np.ones((total_superclusters,total_superclusters))*1e100 - for i_scl in range(total_superclusters): - for j_scl in range(total_superclusters): - index_i=labels_superclusters==i_scl - index_j=labels_superclusters==j_scl - MC=MatrixClustersAll.copy() - np.fill_diagonal(MC,1e100) - ISCmin[i_scl,j_scl]=(MC[index_i,:][:,index_j]).min() - index_i=labels_superclusters==i_scl - ISCmin[i_scl,i_scl]=np.diag(MatrixClustersAll)[index_i].min() - if show: - print('tot:',total_superclusters) - - ISCmin2=ISCmin - - np.fill_diagonal(ISCmin2,1e100) - distA=ISCmin2.min(axis=1) - dist=distA[distA>BORDER].shape[0]/distA.shape[0] - - if show: - print('dist:\n',dist) - return dist,labels_superclusters - - -class GMSDB(): - def __init__(self,min_components=2,step_components=1,n_components=2,verbose=False,show=False,metric='LH',\ - border_percentile=0.001,alpha_stage2=0.05,alpha_stage4=0.1,show_mah=False, \ - show_clusters=False,autostop=True,bh_mh_correct=False,rand_search=0,rand_level=0.5, max_iter=200): - '''! Init class + MC[i, i] = 0. 
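+    # BORDER above is the (1-alpha) chi-squared quantile (df=dim) mapped onto the distance scale via d = sqrt(2*q); the returned MC index is the fraction of superclusters whose nearest-neighbour distance exceeds it. + # The Benjamini-Hochberg branch below converts each intercluster distance d into a p-value p = 1 - F_chi2(d**2/2; df=dim), applies FDR correction across all cluster pairs, and maps the corrected p-values back to distances.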
+ + # ======== Correct by Benjamini-Hochberg + if bh_mh_correct: + pvals = 1 - ss.chi2(df=dim).cdf(MC.reshape(MC.shape[0] * MC.shape[1]) ** 2 / 2) + rej, p_corr, a1, a2 = multipletests(pvals, alpha=alpha, method='fdr_bh') + eeps = 1e-10 + MC = np.sqrt(ss.chi2(df=dim).ppf(np.abs(1. - p_corr - eeps)) * 2).reshape(MC.shape[0], MC.shape[1]) + for i in range(MC.shape[0]): + MC[i, i] = 0. + # ====================================== + + idx, labels_superclusters = dbscan(MC, metric='precomputed', eps=eps, min_samples=1) + + total_superclusters = len(set(list(labels_superclusters))) + if (total_superclusters) < 2: + return -1e100, labels_superclusters + + # mean distance to closest cluster + 1/2 cluster size + ISCmin = np.ones((total_superclusters, total_superclusters)) * 1e100 + for i_scl in range(total_superclusters): + for j_scl in range(total_superclusters): + index_i = labels_superclusters == i_scl + index_j = labels_superclusters == j_scl + MC = MatrixClustersAll.copy() + np.fill_diagonal(MC, 1e100) + ISCmin[i_scl, j_scl] = (MC[index_i, :][:, index_j]).min() + index_i = labels_superclusters == i_scl + ISCmin[i_scl, i_scl] = np.diag(MatrixClustersAll)[index_i].min() + if show: + print('tot:', total_superclusters) + + ISCmin2 = ISCmin + + np.fill_diagonal(ISCmin2, 1e100) + distA = ISCmin2.min(axis=1) + dist = distA[distA > BORDER].shape[0] / distA.shape[0] + + if show: + print('dist:\n', dist) + return dist, labels_superclusters + + +class GMSDB: + def __init__(self, min_components=2, step_components=1, n_components=2, verbose=False, show=False, metric='LH', + border_percentile=0.001, alpha_stage2=0.05, alpha_stage4=0.1, show_mah=False, show_clusters=False, + autostop=True, bh_mh_correct=False, rand_search=0, rand_level=0.5, max_iter=200, centroid_type='mean'): + """ Init class + @param n_components maximal number of gaussian clusters, the more - the slower the stage 1. - I use 100 for simple cases (dot examples), + I use 100 for simple cases (dot examples), and 1000 with speed up search parameters (below: 2,100,1000,0.5) for complex cases (image examples) + @param alpha_stage2 significance level for stage 2. Recommended standard value 0.05 + @param alpha_stage4 significance level for stage 4. Recommended value 0.1 + @param min_components minimal number of gaussian clusters (stage 1), in most cases use 2 - @param step_components fast algorithm parameter for BIC search (stage 1). - Recommended values: 100 (if n_components>=1000); - 20 if 100<n_components<1000; - 1 (if n_components<=100) + @param step_components fast algorithm parameter for BIC search (stage 1). + Recommended values: 100 (if n_components>=1000); + 20 if 100<n_components<1000; + 1 (if n_components<=100) - ## 20 if 100<n_components<1000; ## 1 (if n_components<=100) - def getOptSuperclusters(self,Matrix,gm,X,Y,eps_np_mid): - search_idx=np.array(range(eps_np_mid.shape[0])) - if len(search_idx)>1000: - np.random.shuffle(search_idx) - search_idx=list(search_idx) - eps_supermin=1e100 - eps_supermax=-1e100 - eps_checked={} - - if self.rand_search>0: - np.random.shuffle(search_idx) - for eps_itt in search_idx[:self.rand_search]: - eps=eps_np_mid[eps_itt] - if eps<eps_supermax: - continue - if eps>eps_supermin: - continue - - MatrixTmp=Matrix.copy() #self.getMatrix(X,clusters) - cdist,labels=ConfiDistance(MatrixTmp,self.gm,eps=eps,show=self.verbose,\ - alpha=self.alpha_stage4,bh_mh_correct=self.bh_mh_correct,dim=self.MH_dim) - if 1>=cdist>=0.: - eps_checked[eps_itt]=cdist - elif cdist<=0.: - eps_checked[eps_itt]=1.
- - if cdist>=1.0 or len(set(list(labels)))==1 or cdist==-1e100: - if self.verbose: - print('cmp 2 eps_smin,eps:',eps_supermin,eps,flush=True) - if eps<eps_supermin: - eps_supermin=eps - if self.verbose: - print('changed 2 eps_smax,eps_min limits:',eps_supermax,eps_supermin,flush=True) - - if 0.<=cdist<1.: - if self.verbose: - print('cmp 1 eps_smax,eps:',eps_supermax,eps,flush=True) - if eps>eps_supermax: - eps_supermax=eps - if self.verbose: - print('changed 1 eps_smax,eps_min limits:',eps_supermax,eps_supermin,flush=True) - if eps_supermax>0 and eps_supermin<1e100 and (1.>cdist>self.rand_level): - break - - if self.verbose: - print('will search over eps_smax,eps_min limits:',eps_supermax,eps_supermin,flush=True) - for eid in eps_checked.keys(): - print(eid,eps_np_mid[eid],eps_checked[eid],file=sys.stderr) -# quit() -# /Random search - - search_idx=np.array(range(eps_np_mid.shape[0])) - search_idx=list(search_idx) - - for eps_itt in search_idx: - eps=eps_np_mid[eps_itt] - # skip variants outside current limits - if eps<=eps_supermax: - continue - if eps>eps_supermin: - continue - - for subitt in range(self.SUBITT): - MatrixTmp=Matrix.copy() #self.getMatrix(X,clusters) - cdist,labels=ConfiDistance(MatrixTmp,self.gm,eps=eps,show=self.verbose,\ - alpha=self.alpha_stage4,bh_mh_correct=self.bh_mh_correct,dim=self.MH_dim) - - if self.show_clusters: - print('showing cluster...', eps_itt) - pp.figure() - pp.scatter(X[:,0],X[:,1],c=labels[Y]) - pp.savefig('img/cl-'+str(eps_itt)+'.jpg') - pp.close() - - tot_cl=len(set(list(labels))) - - if tot_cl in checked_indexes: - checked_indexes[tot_cl]+=1 - if tot_cl>1: - checked_indexes[1]=0 - - eps_history[eps]={'classes':tot_cl,'labels':labels,'cdist':cdist} - - if self.verbose: - with open('checked_indexes.json','wb') as f: - pickle.dump(checked_indexes,f) - f.close() - with open('eps_history.json','wb') as f: - pickle.dump(eps_history,f) - f.close() - - if self.verbose: - print('check',eps_itt,'/',eps_np_mid.shape[0],' eps:',eps,'cdist',cdist,'labels:',len(set(list(labels))),'eps lim:',eps_supermax,eps_supermin) - - if cdist>cdist_opt and cdist_opt<=1.: - cdist_opt=cdist - eps_opt=eps - self.best_cluster_num=tot_cl - if self.verbose: - print('it',eps_itt,'cdist',cdist,'labels:',len(set(list(labels)))) - optimals.append({'eps':eps,'classes':tot_cl,'labels':labels,'cdist':cdist}) - with open('optimals.json','wb') as f: - pickle.dump(optimals,f) - f.close() - print('============== optimal!!! ===============',flush=True) - self.labels=labels - - if len(set(list(labels)))<=1 and checked_indexes[1]>10: - if self.verbose: - print('============== no more itterations needed - only 1 class ! ==========',flush=True) - return self.best_cluster_num,self.labels - - if cdist>=1.0 and self.autostop: - if self.verbose: - print('============== no more itterations needed - absolute optimum reached! ==========',flush=True) - return self.best_cluster_num,self.labels - - if len(set(list(labels)))>self.opt_bics: - if self.verbose: - print('it',eps_itt,'cdist',cdist,'labels:',len(set(list(labels)))) - print('============== unexpected break!!! ===============',flush=True) - return self.best_cluster_num,self.labels - - - if len(set(list(labels)))<=1 and checked_indexes[1]>10: - if self.verbose: - print('============== no more itterations needed - only 1 class ! ==========',flush=True) - return self.best_cluster_num,self.labels -# final return - - if self.verbose: - print('============== Unpredicted return ! ==========',cdist_opt,self.best_cluster_num,self.labels,eps_supermax,eps_supermin,flush=True) - return self.best_cluster_num,self.labels - - - def fit(self,X0,show_closest=False,plot_decisions=False,max_bic_search_size=-1): - # show_closest=False - show 2 clusters closesest to given.
Debug option - # plot_decisions=False - plot decision regions when fitting. Debug option - # max_bic_search_size=-1 - if >0 use for bic search stage (stage1) shorter dataset to provide faster search. Debug option - ss=StandardScaler() - X=ss.fit_transform(X0) - if max_bic_search_size>0: - if max_bic_search_size<=1: - max_bic_search_size=int(X0.shape[0]*max_bic_search_size) -# print('max bic search:',max_bic_search_size) - if max_bic_search_size>0: - idx=np.array(range(X.shape[0])) - np.random.shuffle(idx) - Xbic=X[idx[:max_bic_search_size]] - else: - Xbic=X - self.MH_dim=X0.shape[1] - self.ss=ss - best_bic=1e100 - best_C=-1 - mymin=self.min_components - mymax=self.n_components - mystep=self.step_components - for itt in range(10): - for C in range(mymin,mymax,mystep): - - if not C in self.gms: - gm=GaussianMixture(n_components=C,max_iter=self.max_iter) - gm.fit(Xbic) - else: - gm=self.gms[C] - - yp=gm.predict(Xbic) - top_bic=[] - for i in range(100): - idx3=np.random.randint(0,Xbic.shape[0]-1,Xbic.shape[0]) - top_bic.append(gm.bic(Xbic[idx3])) - top_bic=np.array(top_bic) - self.bics[C]=top_bic.mean()+1.96*top_bic.std() - -# self.bics[C]=gm.bic(Xbic) - self.gms[C]=gm - - if self.bics[C]<best_bic: - best_bic=self.bics[C] - best_C=C - self.gm=gm - self.yp=yp - - if plot_decisions: - pp.figure() -# pp.scatter(X[:,0],X[:,1],c=yp) - plot_decision_regions(Xbic,yp,clf=self.gm) - pp.savefig('bicfile.png') - pp.close() - - if self.verbose: - print("C:",C,'bic:',self.bics[C],flush=True) - if(mystep>1): - mymin=best_C-mystep-1 - if mymin<=2: - mymin=2 - mymax=best_C+mystep - if mystep>5: - mystep=mystep//5 - else: - mystep=1 - if self.verbose: - print('new itt:',mymin,mymax,mystep) - else: - break - self.opt_bics=np.argmin(self.bics[self.min_components:])+self.min_components - if self.verbose: - print('optbic:',self.opt_bics,flush=True) - - Y=self.gm.predict(X) - - clusters=set(list(Y)) - Matrix=self.getMatrix(X,clusters) -# show_closest=False - if show_closest: - for i in range(Matrix.shape[0]): - MC=Matrix.copy() - MC[i,i]=1e100 - iclosest=np.argmin(MC[i,:]) - Yc=Y.copy() - Yc[Yc==i]=-1 - Yc[Yc==iclosest]=-2 - MC[i,iclosest]=1e100 - iclosest2=np.argmin(MC[i,:]) - Yc[Yc==iclosest2]=-3 - Yc[Yc>=0]=0 - pp.figure() - for c in set(list(Yc)): - pp.scatter(X[Yc==c,0],X[Yc==c,1],label=str(c)) - pp.legend() - pp.savefig('img/f-'+str(i)+'.jpg') - pp.close() - quit() - if self.verbose: - print('prep matr:\n',np.round(Matrix,2)) - - eps_vals=set(list(Matrix.reshape(Matrix.shape[0]*Matrix.shape[1]))) - eps_np=np.array(sorted(eps_vals)) - eps_np_mid=(eps_np[1:]+eps_np[:-1])/2. - - if self.verbose: - print('unique matrix vals:',eps_np_mid,eps_np_mid.shape[0]) - - best_cl_num,superlabels=self.getOptSuperclusters(Matrix,self.gm,X,Y,eps_np_mid) - - - - def predict(self,X0): - X=self.ss.transform(X0) - Y=self.gm.predict(X) - return self.labels[Y] - - def predict_proba(self,X0): - X=self.ss.transform(X0) - Y=self.gm.predict_proba(X) - superclusters=set(list(self.labels)) - res=np.zeros((Y.shape[0],len(superclusters))) - for scl in superclusters: - res[:,scl]=Y[:,self.labels==scl].sum(axis=1) -# res/=np.expand_dims(res.sum(axis=1),-1) - return res - - def fit_predict(self,X): - self.fit(X) - return self.predict(X) - - def aic(self,X): - return 0 - - def bic(self,X): - return 0 - + """ + # ==== input parameters + # 1.Basic parameters: + self.n_components = n_components ## @param n_components maximal number of gaussian clusters, the more - the slower the stage 1. + ## I use 100 for simple cases (dot examples), + ## and 1000 with speed up search parameters (below: 2,100,1000,0.5) for complex cases (image examples) + + self.alpha_stage2 = alpha_stage2 # @param alpha_stage2 significance level for stage 2. Recommended standard value 0.05 + self.alpha_stage4 = alpha_stage4 # @param alpha_stage4 significance level for stage 4.
Recommended value 0.1 + self.bh_mh_correct = bh_mh_correct # @param bh_mh_correct apply multiple-hypothesis (Benjamini-Hochberg) correction. Default False + + # + # 2.Speed up search parameters: + self.min_components = min_components # @param min_components minimal number of gaussian clusters (stage 1), in most cases use 2 + self.step_components = step_components # @param step_components fast algorithm parameter for BIC search (stage 1). + # Recommended values: 100 (if n_components>=1000); + # 20 if 100<n_components<1000; + # 1 (if n_components<=100) + + self.verbose = verbose + self.show_clusters = show_clusters + self.autostop = autostop + self.rand_search = rand_search + self.rand_level = rand_level + self.max_iter = max_iter + self.centroid_type = centroid_type + + def getOptSuperclusters(self, Matrix, gm, X, Y, eps_np_mid): + search_idx = np.array(range(eps_np_mid.shape[0])) + if len(search_idx) > 0: + np.random.shuffle(search_idx) + search_idx = list(search_idx) + eps_supermin = 1e100 + eps_supermax = -1e100 + eps_checked = {} + + if self.rand_search > 0: + np.random.shuffle(search_idx) + for eps_itt in search_idx[:self.rand_search]: + eps = eps_np_mid[eps_itt] + if eps < eps_supermax: + continue + if eps > eps_supermin: + continue + + MatrixTmp = Matrix.copy() #self.getMatrix(X,clusters) + cdist, labels = ConfiDistance(MatrixTmp, self.gm, eps=eps, show=self.verbose, + alpha=self.alpha_stage4, bh_mh_correct=self.bh_mh_correct, + dim=self.MH_dim) + if 1 >= cdist >= 0.: + eps_checked[eps_itt] = cdist + elif cdist <= 0.: + eps_checked[eps_itt] = 1. + + if cdist >= 1.0 or len(set(list(labels))) == 1 or cdist == -1e100: + if self.verbose: + print('cmp 2 eps_smin,eps:', eps_supermin, eps, flush=True) + if eps < eps_supermin: + eps_supermin = eps + if self.verbose: + print('changed 2 eps_smax,eps_min limits:', eps_supermax, eps_supermin, flush=True) + + if 0. <= cdist < 1.: + if self.verbose: + print('cmp 1 eps_smax,eps:', eps_supermax, eps, flush=True) + if eps > eps_supermax: + eps_supermax = eps + if self.verbose: + print('changed 1 eps_smax,eps_min limits:', eps_supermax, eps_supermin, flush=True) + if eps_supermax > 0 and eps_supermin < 1e100 and (1. > cdist > self.rand_level): + break + + if self.verbose: + print('will search over eps_smax,eps_min limits:', eps_supermax, eps_supermin, flush=True) + for eid in eps_checked.keys(): + print(eid, eps_np_mid[eid], eps_checked[eid], file=sys.stderr) + # quit() + # /Random search + + search_idx = np.array(range(eps_np_mid.shape[0])) + search_idx = list(search_idx) + + for eps_itt in search_idx: + eps = eps_np_mid[eps_itt] + # skip variants outside current limits + if eps <= eps_supermax: + continue + if eps > eps_supermin: + continue + + for subitt in range(self.SUBITT): + MatrixTmp = Matrix.copy() #self.getMatrix(X,clusters) + cdist, labels = ConfiDistance(MatrixTmp, self.gm, eps=eps, show=self.verbose, + alpha=self.alpha_stage4, bh_mh_correct=self.bh_mh_correct, + dim=self.MH_dim) + + if self.show_clusters: + print('showing cluster...', eps_itt) + pp.figure() + pp.scatter(X[:, 0], X[:, 1], c=labels[Y]) + pp.savefig('img/cl-' + str(eps_itt) + '.jpg') + pp.close() + + tot_cl = len(set(list(labels))) + + if tot_cl in checked_indexes: + checked_indexes[tot_cl] += 1 + if tot_cl > 1: + checked_indexes[1] = 0 + + eps_history[eps] = {'classes': tot_cl, 'labels': labels, 'cdist': cdist} + + if self.verbose: + with open('checked_indexes.json', 'wb') as f: + pickle.dump(checked_indexes, f) + f.close() + with open('eps_history.json', 'wb') as f: + pickle.dump(eps_history, f) + f.close() + + if self.verbose: + print('check', eps_itt, '/', eps_np_mid.shape[0], ' eps:', eps, 'cdist', cdist, 'labels:', + len(set(list(labels))), 'eps lim:', eps_supermax, eps_supermin) + + if cdist > cdist_opt and cdist_opt <= 1.: + cdist_opt = cdist + eps_opt = eps + self.best_cluster_num = tot_cl + if self.verbose: + print('it', eps_itt, 'cdist', cdist, 'labels:', len(set(list(labels))))
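+                    # A new best MC index was found: remember eps and the supercluster labels, and pickle the running history of optima to optimals.json.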
+                    optimals.append({'eps': eps, 'classes': tot_cl, 'labels': labels, 'cdist': cdist}) + with open('optimals.json', 'wb') as f: + pickle.dump(optimals, f) + f.close() + print('============== optimal!!! ===============', flush=True) + self.labels = labels + + if len(set(list(labels))) <= 1 and checked_indexes[1] > 10: + if self.verbose: + print('============== no more iterations needed - only 1 class ! ==========', flush=True) + return self.best_cluster_num, self.labels + + if cdist >= 1.0 and self.autostop: + if self.verbose: + print('============== no more iterations needed - absolute optimum reached! ==========', + flush=True) + return self.best_cluster_num, self.labels + + if len(set(list(labels))) > self.opt_bics: + if self.verbose: + print('it', eps_itt, 'cdist', cdist, 'labels:', len(set(list(labels)))) + print('============== unexpected break!!! ===============', flush=True) + return self.best_cluster_num, self.labels + + if len(set(list(labels))) <= 1 and checked_indexes[1] > 10: + if self.verbose: + print('============== no more iterations needed - only 1 class ! ==========', flush=True) + return self.best_cluster_num, self.labels + # final return + + if self.verbose: + print('============== Unexpected return ! ==========', cdist_opt, self.best_cluster_num, self.labels, + eps_supermax, eps_supermin, flush=True) + return self.best_cluster_num, self.labels + + def fit(self, X0, show_closest=False, plot_decisions=False, max_bic_search_size=-1): + """ Train model + @param X0: source dataset + @param show_closest: Show the two clusters closest to a given one. Debug option + @param plot_decisions: Plot decision regions when fitting. Debug option + @param max_bic_search_size: If >0, use a shorter dataset for the BIC search stage (stage 1) to speed up the search. Debug option + """ + + ss = StandardScaler() # local name shadows the module-level scipy.stats alias inside this method + X = ss.fit_transform(X0) + if max_bic_search_size > 0: + if max_bic_search_size <= 1: + max_bic_search_size = int(X0.shape[0] * max_bic_search_size) + # print('max bic search:',max_bic_search_size) + if max_bic_search_size > 0: + idx = np.array(range(X.shape[0])) + np.random.shuffle(idx) + Xbic = X[idx[:max_bic_search_size]] + else: + Xbic = X + self.MH_dim = X0.shape[1] + self.ss = ss + + best_bic = 1e100 + best_C = -1 + mymin = self.min_components + mymax = self.n_components + mystep = self.step_components + for itt in range(10): + for C in range(mymin, mymax, mystep): + + if not C in self.gms: + gm = GaussianMixture(n_components=C, max_iter=self.max_iter) + gm.fit(Xbic) + else: + gm = self.gms[C] + + yp = gm.predict(Xbic) + top_bic = [] + for i in range(100): + idx3 = np.random.randint(0, Xbic.shape[0] - 1, Xbic.shape[0]) + top_bic.append(gm.bic(Xbic[idx3])) + top_bic = np.array(top_bic) + self.bics[C] = top_bic.mean() + 1.96 * top_bic.std() + + # self.bics[C]=gm.bic(Xbic) + self.gms[C] = gm + + if self.bics[C] < best_bic: + best_bic = self.bics[C] + best_C = C + self.gm = gm + self.yp = yp + + if plot_decisions: + pp.figure() + # pp.scatter(X[:,0],X[:,1],c=yp) + plot_decision_regions(Xbic, yp, clf=self.gm) + pp.savefig('bicfile.png') + pp.close() + + if self.verbose: + print("C:", C, 'bic:', self.bics[C], flush=True) + if (mystep > 1): + mymin = best_C - mystep - 1 + if mymin <= 2: + mymin = 2 + mymax = best_C + mystep + if mystep > 5: + mystep = mystep // 5 + else: + mystep = 1 + if self.verbose: + print('new itt:', mymin, mymax, mystep) + else: + break + self.opt_bics = np.argmin(self.bics[self.min_components:]) + self.min_components + if self.verbose: + print('optbic:', self.opt_bics, flush=True) + + Y = self.gm.predict(X) + + clusters =
set(list(Y)) + Matrix = self.getMatrix(X, clusters) + # show_closest=False + if show_closest: + for i in range(Matrix.shape[0]): + MC = Matrix.copy() + MC[i, i] = 1e100 + iclosest = np.argmin(MC[i, :]) + Yc = Y.copy() + Yc[Yc == i] = -1 + Yc[Yc == iclosest] = -2 + MC[i, iclosest] = 1e100 + iclosest2 = np.argmin(MC[i, :]) + Yc[Yc == iclosest2] = -3 + Yc[Yc >= 0] = 0 + pp.figure() + for c in set(list(Yc)): + pp.scatter(X[Yc == c, 0], X[Yc == c, 1], label=str(c)) + pp.legend() + pp.savefig('img/f-' + str(i) + '.jpg') + pp.close() + quit() + if self.verbose: + print('prep matr:\n', np.round(Matrix, 2)) + + eps_vals = set(list(Matrix.reshape(Matrix.shape[0] * Matrix.shape[1]))) + eps_np = np.array(sorted(eps_vals)) + eps_np_mid = (eps_np[1:] + eps_np[:-1]) / 2. + + if self.verbose: + print('unique matrix vals:', eps_np_mid, eps_np_mid.shape[0]) + + best_cl_num, superlabels = self.getOptSuperclusters(Matrix, self.gm, X, Y, eps_np_mid) + + # calculate centroids + self._calculate_centroids(X) + + def predict(self, X0): + X = self.ss.transform(X0) + Y = self.gm.predict(X) + return self.labels[Y] + + def predict_proba(self, X0): + X = self.ss.transform(X0) + Y = self.gm.predict_proba(X) + superclusters = set(list(self.labels)) + res = np.zeros((Y.shape[0], len(superclusters))) + for scl in superclusters: + res[:, scl] = Y[:, self.labels == scl].sum(axis=1) + # res/=np.expand_dims(res.sum(axis=1),-1) + return res + + def fit_predict(self, X): + self.fit(X) + return self.predict(X) + + def aic(self, X): + return 0 + + def bic(self, X): + return 0 + + def _calculate_centroids(self, X): + """Calculates centroids for each cluster based on the selected method. + @param X: Train dataset + """ + unique_labels = np.unique(self.yp) + centroids = np.zeros((len(unique_labels), X.shape[1])) + + for i, label in enumerate(unique_labels): + points_in_cluster = X[self.yp == label] + + if self.centroid_type == 'mean': + centroid = np.mean(points_in_cluster, axis=0).reshape(1, -1) + elif self.centroid_type == 'median': + centroid = np.median(points_in_cluster, axis=0).reshape(1, -1) + elif self.centroid_type == 'geometric': + centroid = ss.gmean(points_in_cluster, axis=0).reshape(1, -1) + elif self.centroid_type == 'rms': + centroid = np.sqrt(np.mean(np.square(points_in_cluster), axis=0)).reshape(1, -1) + else: + raise ValueError(f"Unknown centroid calculation method: {self.centroid_type}") + + centroids[i] = self.ss.inverse_transform(centroid)[0] + + self.cluster_centers_ = centroids + + def get_centroids(self): + """Return centroids""" + return self.cluster_centers_ diff --git a/README.md b/README.md index adafa2f..b011797 100644 --- a/README.md +++ b/README.md @@ -1,41 +1,53 @@ # GMSDB Clusterer Run for a simple test: - - python GMSDB.py - +```shell +python GMSDB.py +``` ## Simple use (sklearn-like style), no speed improvement, default significance level alpha=0.1.
Maximal number of gaussian components=50: +```python +from sklearn.datasets import make_blobs from GMSDB import GMSDB -clf=GMSDB(n_components=50) +S,N=1000,5 +X,Y=make_blobs(n_samples=S, n_features=2, centers=N, cluster_std=0.3, random_state=42) +clf=GMSDB(n_components=50) clf.fit(X) - Y=clf.predict(X) +print(clf.get_centroids()) +``` ## Complex use (with speed improvement for stages 1 and 3): -clf=GMSDB(min_components=2,step_components=100,n_components=900,rand_search=1000,rand_level=0.5) +```python +clf = GMSDB(min_components=2,step_components=100,n_components=900,rand_search=1000,rand_level=0.5) +``` ## Complex use (with custom significance level alpha=0.15): +```python clf=GMSDB(n_components=50,alpha_stage2=0.15,alpha_stage4=0.15) +``` ## Verbose use (show debug information): - +```python clf=GMSDB(n_components=50,verbose=True) +``` # Install with pip You can install it from PyPI: - +```shell pip install gmsdb==2.1 +``` Import: - +```python from gmsdb import GMSDB +``` # Paper: https://arxiv.org/pdf/2309.02623v2.pdf
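+
+# Choosing the centroid type
+
+`fit` also computes one centroid per Gaussian component, and `get_centroids()` returns them in the original (unscaled) feature units. The statistic is selected with the `centroid_type` constructor parameter: 'mean' (default), 'median', 'geometric', or 'rms'. A minimal sketch, using illustrative blob data:
+```python
+from sklearn.datasets import make_blobs
+from GMSDB import GMSDB
+
+X,Y=make_blobs(n_samples=1000, n_features=2, centers=5, cluster_std=0.3, random_state=42)
+clf=GMSDB(n_components=50,centroid_type='median')
+clf.fit(X)
+print(clf.get_centroids())
+```
+Note that 'geometric' applies scipy's gmean to the standardized data, so it assumes the standardized feature values inside each cluster are positive; 'mean' and 'median' are the safe general-purpose choices.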