diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..287a2f0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,162 @@
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
diff --git a/GMSDB.py b/GMSDB.py
index a75f6ba..34856f5 100644
--- a/GMSDB.py
+++ b/GMSDB.py
@@ -1,17 +1,19 @@
-## @package gmsdb_v2.1
+# @package gmsdb_v2.2
# Made under GNU GPL v3.0
# by Oleg I.Berngardt, 2023
-# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as
+# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
+# License as
# published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.
-
+
import sys
import numpy as np
import matplotlib.pyplot as pp
import pickle
+import scipy.stats as ss
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
@@ -21,102 +23,101 @@
# import mahalanobis as mh_dist
from mlxtend.plotting import plot_decision_regions
-
-import scipy.stats as ss
from numpy.linalg import inv
from statsmodels.stats.multitest import multipletests
-def getMahalanobisMatrix(X,gm,alpha=0.05,show=False,N=-1):
- '''! Calculates $p_{ij}$ matrix of Mahalanobis intercluster distances using given significance level alpha
- @param X - source dataset
- @param gm - trained GaussianMixture class
- @param alpha significance level (recommended default value 0.05)
- @param show Debug flag, shows distributions of p_{ij} and p_{ii} into img/ folder. Default False
- @param N - limits processing large datasets, use default -1
- '''
- percent=int(alpha*100)
-# N=-1
- Y=gm.predict(X)
- cov=gm.covariances_
-# print(cov.shape)
- DD=np.zeros((gm.n_components,gm.n_components))
- DN=np.zeros((gm.n_components,gm.n_components))
- pd_ii=np.zeros((1,1))
- pd_jj=np.zeros((1,1))
- pd_ij=np.zeros((1,1))
- pd_ji=np.zeros((1,1))
- for i in range(gm.n_components):
- for j in range(gm.n_components):
-# print('process ',i,j,end=' ')
-
-
- if X[Y==i].shape[0]>2 and X[Y==j].shape[0]>2:
- pd_ij = pairwise_distances(X[Y==i][:N],X[Y==j][:N],metric='mahalanobis',VI=inv(cov[j]))
- pd_ij=pd_ij.reshape(pd_ij.shape[0]*pd_ij.shape[1])
- pd_ji = pairwise_distances(X[Y==j][:N],X[Y==i][:N],metric='mahalanobis',VI=inv(cov[i]))
- pd_ji=pd_ji.reshape(pd_ji.shape[0]*pd_ji.shape[1])
- if X[Y==i].shape[0]>2 and X[Y==i].shape[0]>2:
- pd_ii = pairwise_distances(X[Y==i][:N],X[Y==i][:N],metric='mahalanobis',VI=inv(cov[i]))
- pd_ii=pd_ii.reshape(pd_ii.shape[0]*pd_ii.shape[1])
- if X[Y==j].shape[0]>2 and X[Y==j].shape[0]>2:
- pd_jj = pairwise_distances(X[Y==j][:N],X[Y==j][:N],metric='mahalanobis',VI=inv(cov[j]))
- pd_jj=pd_jj.reshape(pd_jj.shape[0]*pd_jj.shape[1])
-
-
- cond=np.where(np.isnan(pd_ii),False,True)
- pd_ii=pd_ii[cond]
- cond=np.where(np.isnan(pd_jj),False,True)
- pd_jj=pd_jj[cond]
- cond=np.where(np.isnan(pd_ij),False,True)
- pd_ij=pd_ij[cond]
- cond=np.where(np.isnan(pd_ji),False,True)
- pd_ji=pd_ji[cond]
-
- d_ii=np.median(pd_ii)
- d_jj=np.median(pd_jj)
-
- if pd_ii.shape[0]*pd_ij.shape[0]*pd_ji.shape[0]*pd_jj.shape[0]>0:
- d_ij=max(np.percentile(pd_ij,percent),np.percentile(pd_ji,percent))
- elif pd_ij.shape[0]*pd_ji.shape[0]==0:
- if pd_ij.shape[0]>pd_ji.shape[0]:
- d_ij=np.percentile(pd_ij,percent)
- elif pd_ji.shape[0]>pd_ij.shape[0]:
- d_ij=np.percentile(pd_ji,percent)
- else:
- d_ij=-1
-
- if d_ij>=0:
- DD[i,j]+=d_ij
- DN[i,j]+=1
- DD[i,i]+=d_ii
- DN[i,i]+=1
- DD[j,j]+=d_jj
- DN[j,j]+=1
-
- if show:
- pp.figure()
- fig,axs=pp.subplots(2,1)
- axs[0].hist(pd_ii[np.where(np.isnan(pd_ii),False,True)],bins=100,histtype='step',density=True,label='ii')
- axs[0].hist(pd_ij[np.where(np.isnan(pd_ij),False,True)],bins=100,histtype='step',density=True,label='ij')
- axs[0].hist(pd_ji[np.where(np.isnan(pd_ji),False,True)],bins=100,histtype='step',density=True,label='ji')
- axs[0].hist(pd_jj[np.where(np.isnan(pd_jj),False,True)],bins=100,histtype='step',density=True,label='jj')
- axs[0].legend()
- cond=np.where((Y==i) | (Y==j))
- axs[1].scatter(X[cond,0],X[cond,1],c=Y[cond])
- pp.savefig('img/'+str(i)+'-'+str(j)+'.jpg')
- pp.close()
- DD/=DN
- cond=np.where((np.isnan(DD) | np.isinf(DD)),True,False)
- notcond=np.where((np.isnan(DD) | np.isinf(DD)),False,True)
- max1=DD[notcond].max()
- DD[cond]=max1
- return DD
-
-
-
-def ConfiDistance(MatrixClustersAll,gm=False, eps=1.,show=False,\
- alpha=0.1,bh_mh_correct=False,dim=2):
- '''! Joins clusters into superclusters using DBSCAN and calculates MC quality index
+
+def getMahalanobisMatrix(X, gm, alpha=0.05, show=False, N=-1):
+ """ Calculates $p_{ij}$ matrix of Mahalanobis intercluster distances using given significance level alpha
+ @param X - source dataset
+ @param gm - trained GaussianMixture class
+ @param alpha significance level (recommended default value 0.05)
+ @param show Debug flag, saves distributions of p_{ij} and p_{ii} to the img/ folder. Default False
+ @param N - caps the number of samples per cluster when processing large datasets; default -1 (no limit)
+ """
+ percent = int(alpha * 100)
+ # N=-1
+ Y = gm.predict(X)
+ cov = gm.covariances_
+ # print(cov.shape)
+ DD = np.zeros((gm.n_components, gm.n_components))
+ DN = np.zeros((gm.n_components, gm.n_components))
+ pd_ii = np.zeros((1, 1))
+ pd_jj = np.zeros((1, 1))
+ pd_ij = np.zeros((1, 1))
+ pd_ji = np.zeros((1, 1))
+ for i in range(gm.n_components):
+ for j in range(gm.n_components):
+ # print('process ',i,j,end=' ')
+
+ if X[Y == i].shape[0] > 2 and X[Y == j].shape[0] > 2:
+ pd_ij = pairwise_distances(X[Y == i][:N], X[Y == j][:N], metric='mahalanobis', VI=inv(cov[j]))
+ pd_ij = pd_ij.reshape(pd_ij.shape[0] * pd_ij.shape[1])
+ pd_ji = pairwise_distances(X[Y == j][:N], X[Y == i][:N], metric='mahalanobis', VI=inv(cov[i]))
+ pd_ji = pd_ji.reshape(pd_ji.shape[0] * pd_ji.shape[1])
+ if X[Y == i].shape[0] > 2:
+ pd_ii = pairwise_distances(X[Y == i][:N], X[Y == i][:N], metric='mahalanobis', VI=inv(cov[i]))
+ pd_ii = pd_ii.reshape(pd_ii.shape[0] * pd_ii.shape[1])
+ if X[Y == j].shape[0] > 2:
+ pd_jj = pairwise_distances(X[Y == j][:N], X[Y == j][:N], metric='mahalanobis', VI=inv(cov[j]))
+ pd_jj = pd_jj.reshape(pd_jj.shape[0] * pd_jj.shape[1])
+
+ cond = np.where(np.isnan(pd_ii), False, True)
+ pd_ii = pd_ii[cond]
+ cond = np.where(np.isnan(pd_jj), False, True)
+ pd_jj = pd_jj[cond]
+ cond = np.where(np.isnan(pd_ij), False, True)
+ pd_ij = pd_ij[cond]
+ cond = np.where(np.isnan(pd_ji), False, True)
+ pd_ji = pd_ji[cond]
+
+ d_ii = np.median(pd_ii)
+ d_jj = np.median(pd_jj)
+
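+ # d_ij: the (100*alpha)-th percentile of pairwise Mahalanobis distances between clusters i and j,
+ # taking the larger of the two directional estimates (each direction uses the target cluster's covariance)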
+ if pd_ii.shape[0] * pd_ij.shape[0] * pd_ji.shape[0] * pd_jj.shape[0] > 0:
+ d_ij = max(np.percentile(pd_ij, percent), np.percentile(pd_ji, percent))
+ elif pd_ij.shape[0] * pd_ji.shape[0] == 0:
+ if pd_ij.shape[0] > pd_ji.shape[0]:
+ d_ij = np.percentile(pd_ij, percent)
+ elif pd_ji.shape[0] > pd_ij.shape[0]:
+ d_ij = np.percentile(pd_ji, percent)
+ else:
+ d_ij = -1
+
+ if d_ij >= 0:
+ DD[i, j] += d_ij
+ DN[i, j] += 1
+ DD[i, i] += d_ii
+ DN[i, i] += 1
+ DD[j, j] += d_jj
+ DN[j, j] += 1
+
+ if show:
+ pp.figure()
+ fig, axs = pp.subplots(2, 1)
+ axs[0].hist(pd_ii[np.where(np.isnan(pd_ii), False, True)], bins=100, histtype='step', density=True,
+ label='ii')
+ axs[0].hist(pd_ij[np.where(np.isnan(pd_ij), False, True)], bins=100, histtype='step', density=True,
+ label='ij')
+ axs[0].hist(pd_ji[np.where(np.isnan(pd_ji), False, True)], bins=100, histtype='step', density=True,
+ label='ji')
+ axs[0].hist(pd_jj[np.where(np.isnan(pd_jj), False, True)], bins=100, histtype='step', density=True,
+ label='jj')
+ axs[0].legend()
+ cond = np.where((Y == i) | (Y == j))
+ axs[1].scatter(X[cond, 0], X[cond, 1], c=Y[cond])
+ pp.savefig('img/' + str(i) + '-' + str(j) + '.jpg')
+ pp.close()
+ DD /= DN
+ cond = np.where((np.isnan(DD) | np.isinf(DD)), True, False)
+ notcond = np.where((np.isnan(DD) | np.isinf(DD)), False, True)
+ max1 = DD[notcond].max()
+ DD[cond] = max1
+ return DD
+
+
+def ConfiDistance(MatrixClustersAll, gm=False, eps=1., show=False, alpha=0.1, bh_mh_correct=False, dim=2):
+ """ Joins clusters into super clusters using DBSCAN and calculates MC quality index
@param MatrixClustersAll - source Mahalanobis distance matrix of clusters
@param gm - trained GaussianMixture class (not used)
@param eps - hyperparameter for DBSCAN
@@ -131,74 +132,74 @@ def ConfiDistance(MatrixClustersAll,gm=False, eps=1.,show=False,\
@param bh_mh_correct - correct distances using Benjamini-Hochberg method, default False
@param dim - dimension of source data
- '''
-
- BORDER=np.sqrt(ss.chi2(df=dim).ppf(1.-alpha)*2)
- MC=MatrixClustersAll.copy()
- for i in range(MC.shape[0]):
- MC[i,i]=0.
-
-# ======== Correct by Benjamini-Hochberg
- if bh_mh_correct:
- pvals=1-ss.chi2(df=dim).cdf(MC.reshape(MC.shape[0]*MC.shape[1])**2/2)
- rej,p_corr,a1,a2=multipletests(pvals,alpha=alpha,method='fdr_bh')
- eeps=1e-10
- MC=np.sqrt(ss.chi2(df=dim).ppf(np.abs(1.-p_corr-eeps))*2).reshape(MC.shape[0],MC.shape[1])
+ """
+
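+ # separation threshold from the (1-alpha) quantile of the chi-squared distribution (df=dim);
+ # superclusters whose nearest neighbour lies beyond BORDER count as confidently separated below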
+ BORDER = np.sqrt(ss.chi2(df=dim).ppf(1. - alpha) * 2)
+ MC = MatrixClustersAll.copy()
for i in range(MC.shape[0]):
- MC[i,i]=0.
-# ======================================
-
- idx,labels_superclusters=dbscan(MC, metric='precomputed', eps=eps,min_samples=1)
-
- total_superclusters=len(set(list(labels_superclusters)))
- if(total_superclusters)<2:
- return -1e100,labels_superclusters
-
-# mean distance to closest cluster + 1/2 cluster size
- ISCmin=np.ones((total_superclusters,total_superclusters))*1e100
- for i_scl in range(total_superclusters):
- for j_scl in range(total_superclusters):
- index_i=labels_superclusters==i_scl
- index_j=labels_superclusters==j_scl
- MC=MatrixClustersAll.copy()
- np.fill_diagonal(MC,1e100)
- ISCmin[i_scl,j_scl]=(MC[index_i,:][:,index_j]).min()
- index_i=labels_superclusters==i_scl
- ISCmin[i_scl,i_scl]=np.diag(MatrixClustersAll)[index_i].min()
- if show:
- print('tot:',total_superclusters)
-
- ISCmin2=ISCmin
-
- np.fill_diagonal(ISCmin2,1e100)
- distA=ISCmin2.min(axis=1)
- dist=distA[distA>BORDER].shape[0]/distA.shape[0]
-
- if show:
- print('dist:\n',dist)
- return dist,labels_superclusters
-
-
-class GMSDB():
- def __init__(self,min_components=2,step_components=1,n_components=2,verbose=False,show=False,metric='LH',\
- border_percentile=0.001,alpha_stage2=0.05,alpha_stage4=0.1,show_mah=False, \
- show_clusters=False,autostop=True,bh_mh_correct=False,rand_search=0,rand_level=0.5, max_iter=200):
- '''! Init class
+ MC[i, i] = 0.
+
+ # ======== Correct by Benjamini-Hochberg
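+ # map distances to chi-squared p-values, apply the Benjamini-Hochberg FDR correction,
+ # then map the corrected p-values back onto the distance scale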
+ if bh_mh_correct:
+ pvals = 1 - ss.chi2(df=dim).cdf(MC.reshape(MC.shape[0] * MC.shape[1]) ** 2 / 2)
+ rej, p_corr, a1, a2 = multipletests(pvals, alpha=alpha, method='fdr_bh')
+ eeps = 1e-10
+ MC = np.sqrt(ss.chi2(df=dim).ppf(np.abs(1. - p_corr - eeps)) * 2).reshape(MC.shape[0], MC.shape[1])
+ for i in range(MC.shape[0]):
+ MC[i, i] = 0.
+ # ======================================
+
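+ # DBSCAN on the precomputed distance matrix merges clusters closer than eps into superclusters;
+ # min_samples=1 makes every cluster a core point, so none are discarded as noise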
+ idx, labels_superclusters = dbscan(MC, metric='precomputed', eps=eps, min_samples=1)
+
+ total_superclusters = len(set(list(labels_superclusters)))
+ if total_superclusters < 2:
+ return -1e100, labels_superclusters
+
+ # mean distance to closest cluster + 1/2 cluster size
+ ISCmin = np.ones((total_superclusters, total_superclusters)) * 1e100
+ for i_scl in range(total_superclusters):
+ for j_scl in range(total_superclusters):
+ index_i = labels_superclusters == i_scl
+ index_j = labels_superclusters == j_scl
+ MC = MatrixClustersAll.copy()
+ np.fill_diagonal(MC, 1e100)
+ ISCmin[i_scl, j_scl] = (MC[index_i, :][:, index_j]).min()
+ index_i = labels_superclusters == i_scl
+ ISCmin[i_scl, i_scl] = np.diag(MatrixClustersAll)[index_i].min()
+ if show:
+ print('tot:', total_superclusters)
+
+ ISCmin2 = ISCmin
+
+ np.fill_diagonal(ISCmin2, 1e100)
+ distA = ISCmin2.min(axis=1)
+ dist = distA[distA > BORDER].shape[0] / distA.shape[0]
+
+ if show:
+ print('dist:\n', dist)
+ return dist, labels_superclusters
+
+
+class GMSDB:
+ def __init__(self, min_components=2, step_components=1, n_components=2, verbose=False, show=False, metric='LH',
+ border_percentile=0.001, alpha_stage2=0.05, alpha_stage4=0.1, show_mah=False, show_clusters=False,
+ autostop=True, bh_mh_correct=False, rand_search=0, rand_level=0.5, max_iter=200, centroid_type='mean'):
+ """ Init class
@param n_components maximal number of gaussian clusters, the more - the slower the stage 1.
- I use 100 for simple cases (dot examples),
+ I use 100 for simple cases (dot examples),
and 1000 with speed-up search parameters (below: 2,100,1000,0.5) for complex cases (image examples)
@param alpha_stage2 significance level for stage 2. Recommended standard value 0.05
@param alpha_stage4 significance level for stage 4. Recommended value 0.1
@param min_components minimal number of gaussian clusters (stage 1), in most cases use 2
- @param step_components fast algorithm parameter for BIC search (stage 1).
- Recommended values: 100 (if n_components>=1000);
- 20 if 100<n_components<1000;
+ 20 if 100<n_components<1000;
- ## 1 if n_components<=100
- np.random.shuffle(search_idx)
- search_idx=list(search_idx)
- eps_supermin=1e100
- eps_supermax=-1e100
- eps_checked={}
-
- if self.rand_search>0:
- np.random.shuffle(search_idx)
- for eps_itt in search_idx[:self.rand_search]:
- eps=eps_np_mid[eps_itt]
- if eps<eps_supermax:
-     continue
- if eps>eps_supermin:
-     continue
-
- MatrixTmp=Matrix.copy() #self.getMatrix(X,clusters)
- cdist,labels=ConfiDistance(MatrixTmp,self.gm,eps=eps,show=self.verbose,\
- alpha=self.alpha_stage4,bh_mh_correct=self.bh_mh_correct,dim=self.MH_dim)
- if 1>=cdist>=0.:
- eps_checked[eps_itt]=cdist
- elif cdist<=0.:
- eps_checked[eps_itt]=1.
-
- if cdist>=1.0 or len(set(list(labels)))==1 or cdist==-1e100:
- if self.verbose:
- print('cmp 2 eps_smin,eps:',eps_supermin,eps,flush=True)
- if eps<eps_supermin:
-     eps_supermin=eps
-     if self.verbose:
-         print('changed 2 eps_smax,eps_min limits:',eps_supermax,eps_supermin,flush=True)
- if 0.<=cdist<1.:
-     if self.verbose:
-         print('cmp 1 eps_smax,eps:',eps_supermax,eps,flush=True)
-     if eps>eps_supermax:
- eps_supermax=eps
- if self.verbose:
- print('changed 1 eps_smax,eps_min limits:',eps_supermax,eps_supermin,flush=True)
- if eps_supermax>0 and eps_supermin<1e100 and (1.>cdist>self.rand_level):
- break
-
- if self.verbose:
- print('will search over eps_smax,eps_min limits:',eps_supermax,eps_supermin,flush=True)
- for eid in eps_checked.keys():
- print(eid,eps_np_mid[eid],eps_checked[eid],file=sys.stderr)
-# quit()
-# /Random search
-
- search_idx=np.array(range(eps_np_mid.shape[0]))
- search_idx=list(search_idx)
-
- for eps_itt in search_idx:
- eps=eps_np_mid[eps_itt]
- # skip variants outside current limits
- if eps<=eps_supermax:
- continue
- if eps>eps_supermin:
- continue
-
- for subitt in range(self.SUBITT):
- MatrixTmp=Matrix.copy() #self.getMatrix(X,clusters)
- cdist,labels=ConfiDistance(MatrixTmp,self.gm,eps=eps,show=self.verbose,\
- alpha=self.alpha_stage4,bh_mh_correct=self.bh_mh_correct,dim=self.MH_dim)
-
- if self.show_clusters:
- print('showing cluster...', eps_itt)
- pp.figure()
- pp.scatter(X[:,0],X[:,1],c=labels[Y])
- pp.savefig('img/cl-'+str(eps_itt)+'.jpg')
- pp.close()
-
- tot_cl=len(set(list(labels)))
-
- if tot_cl in checked_indexes:
- checked_indexes[tot_cl]+=1
- if tot_cl>1:
- checked_indexes[1]=0
-
- eps_history[eps]={'classes':tot_cl,'labels':labels,'cdist':cdist}
-
- if self.verbose:
- with open('checked_indexes.json','wb') as f:
- pickle.dump(checked_indexes,f)
- f.close()
- with open('eps_history.json','wb') as f:
- pickle.dump(eps_history,f)
- f.close()
-
- if self.verbose:
- print('check',eps_itt,'/',eps_np_mid.shape[0],' eps:',eps,'cdist',cdist,'labels:',len(set(list(labels))),'eps lim:',eps_supermax,eps_supermin)
-
- if cdist>cdist_opt and cdist_opt<=1.:
- cdist_opt=cdist
- eps_opt=eps
- self.best_cluster_num=tot_cl
- if self.verbose:
- print('it',eps_itt,'cdist',cdist,'labels:',len(set(list(labels))))
- optimals.append({'eps':eps,'classes':tot_cl,'labels':labels,'cdist':cdist})
- with open('optimals.json','wb') as f:
- pickle.dump(optimals,f)
- f.close()
- print('============== optimal!!! ===============',flush=True)
- self.labels=labels
-
- if len(set(list(labels)))<=1 and checked_indexes[1]>10:
- if self.verbose:
- print('============== no more itterations needed - only 1 class ! ==========',flush=True)
- return self.best_cluster_num,self.labels
-
- if cdist>=1.0 and self.autostop:
- if self.verbose:
- print('============== no more itterations needed - absolute optimum reached! ==========',flush=True)
- return self.best_cluster_num,self.labels
-
- if len(set(list(labels)))>self.opt_bics:
- if self.verbose:
- print('it',eps_itt,'cdist',cdist,'labels:',len(set(list(labels))))
- print('============== unexpected break!!! ===============',flush=True)
- return self.best_cluster_num,self.labels
-
-
- if len(set(list(labels)))<=1 and checked_indexes[1]>10:
- if self.verbose:
- print('============== no more itterations needed - only 1 class ! ==========',flush=True)
- return self.best_cluster_num,self.labels
-# final return
-
- if self.verbose:
- print('============== Unpredicted return ! ==========',cdist_opt,self.best_cluster_num,self.labels,eps_supermax,eps_supermin,flush=True)
- return self.best_cluster_num,self.labels
-
-
- def fit(self,X0,show_closest=False,plot_decisions=False,max_bic_search_size=-1):
- # show_closest=False - show 2 clusters closesest to given. Debug option
- # plot_decisions=False - plot decision regions when fitting. Debug option
- # max_bic_search_size=-1 - if >0 use for bic search stage (stage1) shorter dataset to provide faster search. Debug option
- ss=StandardScaler()
- X=ss.fit_transform(X0)
- if max_bic_search_size>0:
- if max_bic_search_size<=1:
- max_bic_search_size=int(X0.shape[0]*max_bic_search_size)
-# print('max bic search:',max_bic_search_size)
- if max_bic_search_size>0:
- idx=np.array(range(X.shape[0]))
- np.random.shuffle(idx)
- Xbic=X[idx[:max_bic_search_size]]
- else:
- Xbic=X
- self.MH_dim=X0.shape[1]
- self.ss=ss
- best_bic=1e100
- best_C=-1
- mymin=self.min_components
- mymax=self.n_components
- mystep=self.step_components
- for itt in range(10):
- for C in range(mymin,mymax,mystep):
-
- if not C in self.gms:
- gm=GaussianMixture(n_components=C,max_iter=self.max_iter)
- gm.fit(Xbic)
- else:
- gm=self.gms[C]
-
- yp=gm.predict(Xbic)
- top_bic=[]
- for i in range(100):
- idx3=np.random.randint(0,Xbic.shape[0]-1,Xbic.shape[0])
- top_bic.append(gm.bic(Xbic[idx3]))
- top_bic=np.array(top_bic)
- self.bics[C]=top_bic.mean()+1.96*top_bic.std()
-
-# self.bics[C]=gm.bic(Xbic)
- self.gms[C]=gm
-
- if self.bics[C]<best_bic:
-     best_bic=self.bics[C]
-     best_C=C
-     self.gm=gm
-     self.yp=yp
-     if plot_decisions:
-         pp.figure()
-         # pp.scatter(X[:,0],X[:,1],c=yp)
-         plot_decision_regions(Xbic,yp,clf=self.gm)
-         pp.savefig('bicfile.png')
-         pp.close()
-     if self.verbose:
-         print("C:",C,'bic:',self.bics[C],flush=True)
- if(mystep>1):
- mymin=best_C-mystep-1
- if mymin<=2:
- mymin=2
- mymax=best_C+mystep
- if mystep>5:
- mystep=mystep//5
- else:
- mystep=1
- if self.verbose:
- print('new itt:',mymin,mymax,mystep)
- else:
- break
- self.opt_bics=np.argmin(self.bics[self.min_components:])+self.min_components
- if self.verbose:
- print('optbic:',self.opt_bics,flush=True)
-
- Y=self.gm.predict(X)
-
- clusters=set(list(Y))
- Matrix=self.getMatrix(X,clusters)
-# show_closest=False
- if show_closest:
- for i in range(Matrix.shape[0]):
- MC=Matrix.copy()
- MC[i,i]=1e100
- iclosest=np.argmin(MC[i,:])
- Yc=Y.copy()
- Yc[Yc==i]=-1
- Yc[Yc==iclosest]=-2
- MC[i,iclosest]=1e100
- iclosest2=np.argmin(MC[i,:])
- Yc[Yc==iclosest2]=-3
- Yc[Yc>=0]=0
- pp.figure()
- for c in set(list(Yc)):
- pp.scatter(X[Yc==c,0],X[Yc==c,1],label=str(c))
- pp.legend()
- pp.savefig('img/f-'+str(i)+'.jpg')
- pp.close()
- quit()
- if self.verbose:
- print('prep matr:\n',np.round(Matrix,2))
-
- eps_vals=set(list(Matrix.reshape(Matrix.shape[0]*Matrix.shape[1])))
- eps_np=np.array(sorted(eps_vals))
- eps_np_mid=(eps_np[1:]+eps_np[:-1])/2.
-
- if self.verbose:
- print('unique matrix vals:',eps_np_mid,eps_np_mid.shape[0])
-
- best_cl_num,superlabels=self.getOptSuperclusters(Matrix,self.gm,X,Y,eps_np_mid)
-
-
-
- def predict(self,X0):
- X=self.ss.transform(X0)
- Y=self.gm.predict(X)
- return self.labels[Y]
-
- def predict_proba(self,X0):
- X=self.ss.transform(X0)
- Y=self.gm.predict_proba(X)
- superclusters=set(list(self.labels))
- res=np.zeros((Y.shape[0],len(superclusters)))
- for scl in superclusters:
- res[:,scl]=Y[:,self.labels==scl].sum(axis=1)
-# res/=np.expand_dims(res.sum(axis=1),-1)
- return res
-
- def fit_predict(self,X):
- self.fit(X)
- return self.predict(X)
-
- def aic(self,X):
- return 0
-
- def bic(self,X):
- return 0
-
+ """
+ # ==== input parameters
+ # 1.Basic parameters:
+ self.n_components = n_components ## @param n_components maximal number of gaussian clusters, the more - the slower the stage 1.
+ ## I use 100 for simple cases (dot examples),
+ ## and 1000 with speed-up search parameters (below: 2,100,1000,0.5) for complex cases (image examples)
+
+ self.alpha_stage2 = alpha_stage2 # @param alpha_stage2 significance level for stage 2. Recommended standard value 0.05
+ self.alpha_stage4 = alpha_stage4 # @param alpha_stage4 significance level for stage 4. Recommended value 0.1
+ self.bh_mh_correct = bh_mh_correct # @param bh_mh_correct use for multiple hypothesis correcting. Default False
+
+ #
+ # 2.Speed up search parameters:
+ self.min_components = min_components # @param min_components minimal number of gaussian clusters (stage 1), in most cases use 2
+ self.step_components = step_components # @param step_components fast algorithm parameter for BIC search (stage 1).
+ # Recommended values: 100 (if n_components>=1000);
+ # 20 if 100<n_components<1000; 1 if n_components<=100
+ np.random.shuffle(search_idx)
+ search_idx = list(search_idx)
+ eps_supermin = 1e100
+ eps_supermax = -1e100
+ eps_checked = {}
+
+ if self.rand_search > 0:
+ np.random.shuffle(search_idx)
+ for eps_itt in search_idx[:self.rand_search]:
+ eps = eps_np_mid[eps_itt]
+ if eps < eps_supermax:
+ continue
+ if eps > eps_supermin:
+ continue
+
+ MatrixTmp = Matrix.copy() #self.getMatrix(X,clusters)
+ cdist, labels = ConfiDistance(MatrixTmp, self.gm, eps=eps, show=self.verbose,
+ alpha=self.alpha_stage4, bh_mh_correct=self.bh_mh_correct,
+ dim=self.MH_dim)
+ if 1 >= cdist >= 0.:
+ eps_checked[eps_itt] = cdist
+ elif cdist <= 0.:
+ eps_checked[eps_itt] = 1.
+
+ if cdist >= 1.0 or len(set(list(labels))) == 1 or cdist == -1e100:
+ if self.verbose:
+ print('cmp 2 eps_smin,eps:', eps_supermin, eps, flush=True)
+ if eps < eps_supermin:
+ eps_supermin = eps
+ if self.verbose:
+ print('changed 2 eps_smax,eps_min limits:', eps_supermax, eps_supermin, flush=True)
+
+ if 0. <= cdist < 1.:
+ if self.verbose:
+ print('cmp 1 eps_smax,eps:', eps_supermax, eps, flush=True)
+ if eps > eps_supermax:
+ eps_supermax = eps
+ if self.verbose:
+ print('changed 1 eps_smax,eps_min limits:', eps_supermax, eps_supermin, flush=True)
+ if eps_supermax > 0 and eps_supermin < 1e100 and (1. > cdist > self.rand_level):
+ break
+
+ if self.verbose:
+ print('will search over eps_smax,eps_min limits:', eps_supermax, eps_supermin, flush=True)
+ for eid in eps_checked.keys():
+ print(eid, eps_np_mid[eid], eps_checked[eid], file=sys.stderr)
+ # quit()
+ # /Random search
+
+ search_idx = np.array(range(eps_np_mid.shape[0]))
+ search_idx = list(search_idx)
+
+ for eps_itt in search_idx:
+ eps = eps_np_mid[eps_itt]
+ # skip variants outside current limits
+ if eps <= eps_supermax:
+ continue
+ if eps > eps_supermin:
+ continue
+
+ for subitt in range(self.SUBITT):
+ MatrixTmp = Matrix.copy() #self.getMatrix(X,clusters)
+ cdist, labels = ConfiDistance(MatrixTmp, self.gm, eps=eps, show=self.verbose,
+ alpha=self.alpha_stage4, bh_mh_correct=self.bh_mh_correct,
+ dim=self.MH_dim)
+
+ if self.show_clusters:
+ print('showing cluster...', eps_itt)
+ pp.figure()
+ pp.scatter(X[:, 0], X[:, 1], c=labels[Y])
+ pp.savefig('img/cl-' + str(eps_itt) + '.jpg')
+ pp.close()
+
+ tot_cl = len(set(list(labels)))
+
+ if tot_cl in checked_indexes:
+ checked_indexes[tot_cl] += 1
+ if tot_cl > 1:
+ checked_indexes[1] = 0
+
+ eps_history[eps] = {'classes': tot_cl, 'labels': labels, 'cdist': cdist}
+
+ if self.verbose:
+ with open('checked_indexes.json', 'wb') as f:
+     pickle.dump(checked_indexes, f)
+ with open('eps_history.json', 'wb') as f:
+     pickle.dump(eps_history, f)
+
+ if self.verbose:
+ print('check', eps_itt, '/', eps_np_mid.shape[0], ' eps:', eps, 'cdist', cdist, 'labels:',
+ len(set(list(labels))), 'eps lim:', eps_supermax, eps_supermin)
+
+ if cdist > cdist_opt and cdist_opt <= 1.:
+ cdist_opt = cdist
+ eps_opt = eps
+ self.best_cluster_num = tot_cl
+ if self.verbose:
+ print('it', eps_itt, 'cdist', cdist, 'labels:', len(set(list(labels))))
+ optimals.append({'eps': eps, 'classes': tot_cl, 'labels': labels, 'cdist': cdist})
+ with open('optimals.json', 'wb') as f:
+     pickle.dump(optimals, f)
+ print('============== optimal!!! ===============', flush=True)
+ self.labels = labels
+
+ if len(set(list(labels))) <= 1 and checked_indexes[1] > 10:
+ if self.verbose:
+ print('============== no more iterations needed - only 1 class! ==========', flush=True)
+ return self.best_cluster_num, self.labels
+
+ if cdist >= 1.0 and self.autostop:
+ if self.verbose:
+ print('============== no more iterations needed - absolute optimum reached! ==========',
+ flush=True)
+ return self.best_cluster_num, self.labels
+
+ if len(set(list(labels))) > self.opt_bics:
+ if self.verbose:
+ print('it', eps_itt, 'cdist', cdist, 'labels:', len(set(list(labels))))
+ print('============== unexpected break!!! ===============', flush=True)
+ return self.best_cluster_num, self.labels
+
+ if len(set(list(labels))) <= 1 and checked_indexes[1] > 10:
+ if self.verbose:
+ print('============== no more iterations needed - only 1 class! ==========', flush=True)
+ return self.best_cluster_num, self.labels
+ # final return
+
+ if self.verbose:
+ print('============== Unexpected return! ==========', cdist_opt, self.best_cluster_num, self.labels,
+ eps_supermax, eps_supermin, flush=True)
+ return self.best_cluster_num, self.labels
+
+ def fit(self, X0, show_closest=False, plot_decisions=False, max_bic_search_size=-1):
+ """ Train model
+ @param X0:
+ @param show_closest: Show two clusters closest to give. Debug option
+ @param plot_decisions: Plot decision regions when fitting. Debug option
+ @param max_bic_search_size: If >0 uses for bic search stage (stage1) shorter dataset to provide faster search. Debug option
+ """
+
+ scaler = StandardScaler()  # local name: `ss` is already bound to scipy.stats at module level
+ X = scaler.fit_transform(X0)
+ self.ss = scaler
+ if max_bic_search_size > 0:
+ if max_bic_search_size <= 1:
+ max_bic_search_size = int(X0.shape[0] * max_bic_search_size)
+ # print('max bic search:',max_bic_search_size)
+ if max_bic_search_size > 0:
+ idx = np.array(range(X.shape[0]))
+ np.random.shuffle(idx)
+ Xbic = X[idx[:max_bic_search_size]]
+ else:
+ Xbic = X
+ self.MH_dim = X0.shape[1]
+
+ best_bic = 1e100
+ best_C = -1
+ mymin = self.min_components
+ mymax = self.n_components
+ mystep = self.step_components
+ for itt in range(10):
+ for C in range(mymin, mymax, mystep):
+
+ if C not in self.gms:
+ gm = GaussianMixture(n_components=C, max_iter=self.max_iter)
+ gm.fit(Xbic)
+ else:
+ gm = self.gms[C]
+
+ yp = gm.predict(Xbic)
+ top_bic = []
+ for i in range(100):
+ idx3 = np.random.randint(0, Xbic.shape[0] - 1, Xbic.shape[0])
+ top_bic.append(gm.bic(Xbic[idx3]))
+ top_bic = np.array(top_bic)
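+ # score each C by an upper confidence bound of the bootstrapped BIC (normal approximation),
+ # penalizing fits whose BIC varies strongly across resamples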
+ self.bics[C] = top_bic.mean() + 1.96 * top_bic.std()
+
+ # self.bics[C]=gm.bic(Xbic)
+ self.gms[C] = gm
+
+ if self.bics[C] < best_bic:
+ best_bic = self.bics[C]
+ best_C = C
+ self.gm = gm
+ self.yp = yp
+
+ if plot_decisions:
+ pp.figure()
+ # pp.scatter(X[:,0],X[:,1],c=yp)
+ plot_decision_regions(Xbic, yp, clf=self.gm)
+ pp.savefig('bicfile.png')
+ pp.close()
+
+ if self.verbose:
+ print("C:", C, 'bic:', self.bics[C], flush=True)
+ if (mystep > 1):
+ mymin = best_C - mystep - 1
+ if mymin <= 2:
+ mymin = 2
+ mymax = best_C + mystep
+ if mystep > 5:
+ mystep = mystep // 5
+ else:
+ mystep = 1
+ if self.verbose:
+ print('new itt:', mymin, mymax, mystep)
+ else:
+ break
+ self.opt_bics = np.argmin(self.bics[self.min_components:]) + self.min_components
+ if self.verbose:
+ print('optbic:', self.opt_bics, flush=True)
+
+ Y = self.gm.predict(X)
+
+ clusters = set(list(Y))
+ Matrix = self.getMatrix(X, clusters)
+ # show_closest=False
+ if show_closest:
+ for i in range(Matrix.shape[0]):
+ MC = Matrix.copy()
+ MC[i, i] = 1e100
+ iclosest = np.argmin(MC[i, :])
+ Yc = Y.copy()
+ Yc[Yc == i] = -1
+ Yc[Yc == iclosest] = -2
+ MC[i, iclosest] = 1e100
+ iclosest2 = np.argmin(MC[i, :])
+ Yc[Yc == iclosest2] = -3
+ Yc[Yc >= 0] = 0
+ pp.figure()
+ for c in set(list(Yc)):
+ pp.scatter(X[Yc == c, 0], X[Yc == c, 1], label=str(c))
+ pp.legend()
+ pp.savefig('img/f-' + str(i) + '.jpg')
+ pp.close()
+ quit()
+ if self.verbose:
+ print('prep matr:\n', np.round(Matrix, 2))
+
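+ # candidate eps values: midpoints between consecutive unique entries of the inter-cluster distance matrix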
+ eps_vals = set(list(Matrix.reshape(Matrix.shape[0] * Matrix.shape[1])))
+ eps_np = np.array(sorted(eps_vals))
+ eps_np_mid = (eps_np[1:] + eps_np[:-1]) / 2.
+
+ if self.verbose:
+ print('unique matrix vals:', eps_np_mid, eps_np_mid.shape[0])
+
+ best_cl_num, superlabels = self.getOptSuperclusters(Matrix, self.gm, X, Y, eps_np_mid)
+
+ # calculate centroids
+ self._calculate_centroids(X)
+
+ def predict(self, X0):
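+ # scale the input, assign each sample to a Gaussian component, then map components to supercluster labels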
+ X = self.ss.transform(X0)
+ Y = self.gm.predict(X)
+ return self.labels[Y]
+
+ def predict_proba(self, X0):
+ X = self.ss.transform(X0)
+ Y = self.gm.predict_proba(X)
+ superclusters = set(list(self.labels))
+ res = np.zeros((Y.shape[0], len(superclusters)))
+ for scl in superclusters:
+ res[:, scl] = Y[:, self.labels == scl].sum(axis=1)
+ # res/=np.expand_dims(res.sum(axis=1),-1)
+ return res
+
+ def fit_predict(self, X):
+ self.fit(X)
+ return self.predict(X)
+
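+ # aic/bic are placeholder stubs (always 0); model selection happens inside fit() via bootstrapped BIC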
+ def aic(self, X):
+ return 0
+
+ def bic(self, X):
+ return 0
+
+ def _calculate_centroids(self, X):
+ """Calculates centroids for each cluster based on the selected method.
+ @param X: training dataset (standardized; assumed to align with self.yp from fit)
+ """
+ unique_labels = np.unique(self.yp)
+ centroids = np.zeros((len(unique_labels), X.shape[1]))
+
+ for i, label in enumerate(unique_labels):
+ points_in_cluster = X[self.yp == label]
+
+ if self.centroid_type == 'mean':
+ centroid = np.mean(points_in_cluster, axis=0).reshape(1, -1)
+ elif self.centroid_type == 'median':
+ centroid = np.median(points_in_cluster, axis=0).reshape(1, -1)
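+ # note: gmean below requires non-negative inputs; X is standardized here, so clusters
+ # containing negative values can yield NaN centroids with centroid_type='geometric'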
+ elif self.centroid_type == 'geometric':
+ centroid = ss.gmean(points_in_cluster, axis=0).reshape(1, -1)
+ elif self.centroid_type == 'rms':
+ centroid = np.sqrt(np.mean(np.square(points_in_cluster), axis=0)).reshape(1, -1)
+ else:
+ raise ValueError(f"Unknown centroid calculation method: {self.centroid_type}")
+
+ centroids[i] = self.ss.inverse_transform(centroid)[0]
+
+ self.cluster_centers_ = centroids
+
+ def get_centroids(self):
+ """Return centroids"""
+ return self.cluster_centers_
diff --git a/README.md b/README.md
index adafa2f..b011797 100644
--- a/README.md
+++ b/README.md
@@ -1,41 +1,53 @@
# GMSDB Clusterer
Run for simple test:
-
- python GMSDB.py
-
+```shell
+python GMSDB.py
+```
## Simple use (sklearn-like style), no speed improvement, default significance level alpha=0.1.
Maximal number of Gaussian components = 50:
+```python
+from sklearn.datasets import make_blobs
from GMSDB import GMSDB
-clf=GMSDB(n_components=50)
+S,N=1000,5
+X,Y=make_blobs(n_samples=S, n_features=2, centers=N, cluster_std=0.3, random_state=42)
+clf=GMSDB(n_components=50)
clf.fit(X)
-
Y=clf.predict(X)
+print(clf.get_centroids())
+```
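+## Choosing the centroid statistic:
+A minimal sketch of the `centroid_type` option (assuming the methods implemented in `_calculate_centroids`: 'mean', 'median', 'geometric', 'rms'):
+```python
+clf = GMSDB(n_components=50, centroid_type='median')
+clf.fit(X)
+print(clf.get_centroids())  # centroids are returned in the original (unscaled) coordinates
+```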
## Complex use (with speed improvement for stages 1 and 3):
-clf=GMSDB(min_components=2,step_components=100,n_components=900,rand_search=1000,rand_level=0.5)
+```python
+clf = GMSDB(min_components=2,step_components=100,n_components=900,rand_search=1000,rand_level=0.5)
+```
## Complex use (with custom significance level alpha=0.15):
+```python
clf=GMSDB(n_components=50,alpha_stage2=0.15,alpha_stage4=0.15)
+```
## Verbose use (show debug information):
-
+```python
clf=GMSDB(n_components=50,verbose=True)
+```
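+## Supercluster membership probabilities:
+A short sketch; `predict_proba` sums the Gaussian responsibilities over the components of each supercluster:
+```python
+proba = clf.predict_proba(X)  # shape: (n_samples, n_superclusters)
+```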
# Install with pip
You can install it from PyPI:
-
+```shell
pip install gmsdb==2.1
+```
Import:
-
+```python
from gmsdb import GMSDB
+```
# Paper:
https://arxiv.org/pdf/2309.02623v2.pdf