Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 166 additions & 128 deletions Clustering_conda.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
id: clustering_example_conda
description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
version: 1.4
version: "1.5.0"
benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
storage: http://omnibenchmark.org:9000
benchmark_yaml_spec: 0.04
storage_api: S3
storage_bucket_name: clusteringexampleconda
storage:
api: S3
endpoint: http://omnibenchmark.mls.uzh.ch:9000
bucket_name: clusteringexampleconda
software_backend: conda
software_environments:
clustbench:
Expand All @@ -24,12 +24,12 @@ metric_collectors:
software_environment: "fcps"
repository:
url: https://github.com/imallona/clustering_report
commit: bbb9d56
commit: "040"
inputs:
- metrics.scores
outputs:
- id: plotting.html
path: "{input}/{name}/plotting_report.html"
path: "{name}/plotting_report.html"
stages:
## clustbench data ##########################################################

Expand All @@ -40,152 +40,192 @@ stages:
software_environment: "clustbench"
repository:
url: https://github.com/imallona/clustbench_data
commit: 366c5a2
commit: fc67ebd
parameters: # comments depict the possible cardinalities and the number of curated labelsets
- values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1
- values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1
# - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2
# - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1
# - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1
# - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2
# - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1
# - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1
# - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1
# - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1
# - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6
# - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1
# - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2
# - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1
# - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1
# - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2
# - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2
# - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2
# - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2
# - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1
# - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1
# - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1
# - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1
# - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1
# - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1
# - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1
# - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5
# - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2
# - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1
# - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2
# - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3
# - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1
# - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1
# - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2
# - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1
# - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1
- dataset_generator: "fcps"
dataset_name: ["atom", "chainlink"] # 2 1
# - dataset_generator: "fcps"
# dataset_name: ["engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] # 2 2, 7 1, 3 1, 2,6 2, 4 1, 2 1, 2 1
# - dataset_generator: "graves"
# dataset_name: ["dense"] # 2 1
# - dataset_generator: "graves"
# dataset_name: ["fuzzyx"] # 2,4,5 6
# - dataset_generator: "graves"
# dataset_name: ["line"] # 2 1
# - dataset_generator: "graves"
# dataset_name: ["parabolic"] # 2,4 2
# - dataset_generator: "graves"
# dataset_name: ["ring"] # 2 1
# - dataset_generator: "graves"
# dataset_name: ["ring_noisy"] # 2 1
# - dataset_generator: "graves"
# dataset_name: ["ring_outliers"] # 2,5 2
# - dataset_generator: "graves"
# dataset_name: ["zigzag"] # 3,5 2
# - dataset_generator: "graves"
# dataset_name: ["zigzag_noisy"] # 3,5 2
# - dataset_generator: "graves"
# dataset_name: ["zigzag_outliers"] # 3,5 2
# - dataset_generator: "other"
# dataset_name: ["chameleon_t4_8k"] # 6 1
# - dataset_generator: "other"
# dataset_name: ["chameleon_t5_8k"] # 6 1
# - dataset_generator: "other"
# dataset_name: ["hdbscan"] # 6 1
# - dataset_generator: "other"
# dataset_name: ["iris"] # 3 1
# - dataset_generator: "other"
# dataset_name: ["iris5"] # 3 1
# - dataset_generator: "other"
# dataset_name: ["square"] # 2 1
# - dataset_generator: "sipu"
# dataset_name: ["aggregation"] # 7 1
# - dataset_generator: "sipu"
# dataset_name: ["compound"] # 4,5,6 5
# - dataset_generator: "sipu"
# dataset_name: ["flame"] # 2 2
# - dataset_generator: "sipu"
# dataset_name: ["jain"] # 2 1
# - dataset_generator: "sipu"
# dataset_name: ["pathbased"] # 3,4 2
# - dataset_generator: "sipu"
# dataset_name: ["r15"] # 8,9,15 3
# - dataset_generator: "sipu"
# dataset_name: ["spiral"] # 3 1
# - dataset_generator: "sipu"
# dataset_name: ["unbalance"] # 8 1
# - dataset_generator: "uci"
# dataset_name: ["ecoli"] # 8 1
# - dataset_generator: "uci"
# dataset_name: ["ionosphere"] # 2 1
# - dataset_generator: "uci"
# dataset_name: ["sonar"] # 2 1
# - dataset_generator: "uci"
# dataset_name: ["statlog"] # 7 1
# - dataset_generator: "uci"
# dataset_name: ["wdbc"] # 2 1
# - dataset_generator: "uci"
# dataset_name: ["wine"] # 3 1
# - dataset_generator: "uci"
# dataset_name: ["yeast"] # 10 1
# - dataset_generator: "wut"
# dataset_name: ["circles"] # 4 1
# - dataset_generator: "wut"
# dataset_name: ["cross"] # 4 1
# - dataset_generator: "wut"
# dataset_name: ["graph"] # 10 1
# - dataset_generator: "wut"
# dataset_name: ["isolation"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["labirynth"] # 6 1
# - dataset_generator: "wut"
# dataset_name: ["mk1"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["mk2"] # 2 1
# - dataset_generator: "wut"
# dataset_name: ["mk3"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["mk4"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["olympic"] # 5 1
# - dataset_generator: "wut"
# dataset_name: ["smile"] # 4,6 2
# - dataset_generator: "wut"
# dataset_name: ["stripes"] # 2 1
# - dataset_generator: "wut"
# dataset_name: ["trajectories"] # 4 1
# - dataset_generator: "wut"
# dataset_name: ["trapped_lovers"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["twosplashes"] # 2 1
# - dataset_generator: "wut"
# dataset_name: ["windows"] # 5 1
# - dataset_generator: "wut"
# dataset_name: ["x1"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["x2"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["x3"] # 4 1
# - dataset_generator: "wut"
# dataset_name: ["z1"] # 3 1
# - dataset_generator: "wut"
# dataset_name: ["z2"] # 5 1
# - dataset_generator: "wut"
# dataset_name: ["z3"] # 4 1
outputs:
- id: data.matrix
path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
path: "{dataset}.data.gz"
- id: data.true_labels
path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
path: "{dataset}.labels0.gz"

## clustbench methods (fastcluster) ###################################################################

- id: clustering
modules:
- id: fastcluster
name: "fastcluster algorithm"
software_environment: "clustbench"
repository:
url: https://github.com/imallona/clustbench_fastcluster
# url: /home/imallona/src/clustbench_fastcluster/
commit: "45e43d3"
commit: e644ce5
parameters:
- values: ["--linkage", "complete"]
# - values: ["--linkage", "ward"]
# - values: ["--linkage", "average"]
# - values: ["--linkage", "weighted"]
# - values: ["--linkage", "median"]
# - values: ["--linkage", "centroid"]
- linkage: "complete"
#- linkage: ["ward", "average", "weighted", "median", "centroid"]
- id: sklearn
name: "sklearn"
software_environment: "clustbench"
repository:
url: https://github.com/imallona/clustbench_sklearn
#url: /home/imallona/src/clustbench_sklearn
commit: 5877378
commit: dcf35e1
parameters:
- values: ["--method", "birch"]
# - values: ["--method", "kmeans"]
# - values: ["--method", "spectral"] ## too slow
# - values: ["--method", "gm"]
- method: "birch"
# ["kmeans", "gm"]
# ["spectral"] ## too slow
- id: agglomerative
name: "agglomerative"
software_environment: "clustbench"
repository:
url: https://github.com/imallona/clustbench_agglomerative
commit: 5454368
commit: 9d086a9
parameters:
- values: ["--linkage", "average"]
# - values: ["--linkage", "complete"]
# - values: ["--linkage", "ward"]
- linkage: "average"
# ["complete", "ward"]
- id: genieclust
name: "genieclust"
software_environment: "clustbench"
repository:
url: https://github.com/imallona/clustbench_genieclust
commit: 6090043
commit: 7d9e799
parameters:
- values: ["--method", "genie", "--gini_threshold", 0.5]
# - values: ["--method", "gic"]
# - values: ["--method", "ica"]
- method: "genie"
# method: ["gic", "ica"]
gini_threshold: 0.5
- id: fcps
name: "fcps"
software_environment: "fcps"
repository:
url: https://github.com/imallona/clustbench_fcps
commit: fc37faa
commit: e780fed
parameters:
# - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in Conda
- values: ["--method", "FCPS_Minimax", "--seed", 2]
# - values: ["--method", "FCPS_MinEnergy", "--seed", 2]
# - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2]
# - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2]
# - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2]
# - values: ["--method", "FCPS_Diana", "--seed", 2]
# - values: ["--method", "FCPS_Fanny", "--seed", 2]
# - values: ["--method", "FCPS_Hardcl", "--seed", 2]
# - values: ["--method", "FCPS_Softcl", "--seed", 2]
# - values: ["--method", "FCPS_Clara", "--seed", 2]
# - values: ["--method", "FCPS_PAM", "--seed", 2]
- method: "FCPS_Minimax"
seed: 2
# - "FCPS_AdaptiveDensityPeak" # not in Conda
# - "FCPS_MinEnergy",
# - "FCPS_HDBSCAN_2",
# - "FCPS_HDBSCAN_4",
# - "FCPS_HDBSCAN_8",
# - "FCPS_Diana",
# - "FCPS_Fanny",
# - "FCPS_Hardcl",
# - "FCPS_Softcl",
# - "FCPS_Clara",
# - "FCPS_PAM"
inputs:
- entries:
- data.matrix
- data.true_labels
- data.matrix
- data.true_labels
outputs:
- id: clustering.predicted_ks_range
path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
path: "{dataset}_ks_range.labels.gz"

- id: metrics
modules:
Expand All @@ -194,23 +234,21 @@ stages:
software_environment: "clustbench"
repository:
url: https://github.com/imallona/clustbench_metrics
commit: 9132d45
commit: c4eda85
parameters:
- values: ["--metric", "normalized_clustering_accuracy"]
- values: ["--metric", "adjusted_fm_score"]
# - values: ["--metric", "adjusted_mi_score"]
# - values: ["--metric", "adjusted_rand_score"]
# - values: ["--metric", "fm_score"]
# - values: ["--metric", "mi_score"]
# - values: ["--metric", "normalized_clustering_accuracy"]
# - values: ["--metric", "normalized_mi_score"]
# - values: ["--metric", "normalized_pivoted_accuracy"]
# - values: ["--metric", "pair_sets_index"]
# - values: ["--metric", "rand_score"]
- metric: ["normalized_clustering_accuracy", "adjusted_fm_score"]
# - "adjusted_mi_score"
# - "adjusted_rand_score"
# - "fm_score"
# - "mi_score"
# - "normalized_clustering_accuracy"
# - "normalized_mi_score"
# - "normalized_pivoted_accuracy"
# - "pair_sets_index"
# - "rand_score"
inputs:
- entries:
- clustering.predicted_ks_range
- data.true_labels
- clustering.predicted_ks_range
- data.true_labels
outputs:
- id: metrics.scores
path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
path: "{dataset}.scores.gz"
Loading
Loading