diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 9a8e52c..bc56d6b 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,11 +1,11 @@ id: clustering_example_conda description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 +version: "1.5.0" benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleconda +storage: + api: S3 + endpoint: http://omnibenchmark.mls.uzh.ch:9000 + bucket_name: clusteringexampleconda software_backend: conda software_environments: clustbench: @@ -24,12 +24,12 @@ metric_collectors: software_environment: "fcps" repository: url: https://github.com/imallona/clustering_report - commit: bbb9d56 + commit: "040" inputs: - metrics.scores outputs: - id: plotting.html - path: "{input}/{name}/plotting_report.html" + path: "{name}/plotting_report.html" stages: ## clustbench data ########################################################## @@ -40,78 +40,126 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: fc67ebd parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - # - values: ["--dataset_generator", "fcps", 
"--dataset_name", "wingnut"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - # - values: 
["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - # - values: 
["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + - dataset_generator: "fcps" + dataset_name: ["atom", "chainlink"] # 2 1 + # - dataset_generator: "fcps" + # dataset_name: ["engytime", "hepta", "lsun", "target", "tetra", "twodiamonds", "wingnut"] # 7 1, 3 1, 2,6 2, 4 1, 2 1, 2 1 + # - dataset_generator: "graves" + # dataset_name: ["dense"] # 2 1 + # - dataset_generator: "graves" + # dataset_name: ["fuzzyx"] # 2,4,5 6 + # - dataset_generator: "graves" + # dataset_name: ["line"] # 2 1 + # - dataset_generator: "graves" + # dataset_name: ["parabolic"] # 2,4 2 + # - dataset_generator: "graves" + # dataset_name: ["ring"] # 2 1 + # - dataset_generator: "graves" + # dataset_name: ["ring_noisy"] # 2 1 + # - dataset_generator: "graves" + # dataset_name: ["ring_outliers"] # 2,5 2 + # - dataset_generator: "graves" + # dataset_name: ["zigzag"] # 3,5 2 + # - dataset_generator: "graves" + # dataset_name: ["zigzag_noisy"] # 3,5 2 + # - dataset_generator: "graves" + # dataset_name: ["zigzag_outliers"] # 3,5 2 + # - dataset_generator: "other" + # dataset_name: ["chameleon_t4_8k"] # 6 1 + # - dataset_generator: "other" + # dataset_name: ["chameleon_t5_8k"] # 6 1 + # - dataset_generator: "other" + # dataset_name: ["hdbscan"] # 6 1 + # - dataset_generator: "other" + # dataset_name: ["iris"] # 3 1 + # - dataset_generator: "other" + # dataset_name: ["iris5"] # 3 1 + # - dataset_generator: "other" + # dataset_name: ["square"] # 2 1 + # - dataset_generator: "sipu" + # dataset_name: ["aggregation"] # 7 1 + # - dataset_generator: "sipu" + # dataset_name: ["compound"] # 4,5,6 5 + # - dataset_generator: "sipu" + # dataset_name: ["flame"] # 2 2 + # - dataset_generator: "sipu" + # 
dataset_name: ["jain"] # 2 1 + # - dataset_generator: "sipu" + # dataset_name: ["pathbased"] # 3,4 2 + # - dataset_generator: "sipu" + # dataset_name: ["r15"] # 8,9,15 3 + # - dataset_generator: "sipu" + # dataset_name: ["spiral"] # 3 1 + # - dataset_generator: "sipu" + # dataset_name: ["unbalance"] # 8 1 + # - dataset_generator: "uci" + # dataset_name: ["ecoli"] # 8 1 + # - dataset_generator: "uci" + # dataset_name: ["ionosphere"] # 2 1 + # - dataset_generator: "uci" + # dataset_name: ["sonar"] # 2 1 + # - dataset_generator: "uci" + # dataset_name: ["statlog"] # 7 1 + # - dataset_generator: "uci" + # dataset_name: ["wdbc"] # 2 1 + # - dataset_generator: "uci" + # dataset_name: ["wine"] # 3 1 + # - dataset_generator: "uci" + # dataset_name: ["yeast"] # 10 1 + # - dataset_generator: "wut" + # dataset_name: ["circles"] # 4 1 + # - dataset_generator: "wut" + # dataset_name: ["cross"] # 4 1 + # - dataset_generator: "wut" + # dataset_name: ["graph"] # 10 1 + # - dataset_generator: "wut" + # dataset_name: ["isolation"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["labirynth"] # 6 1 + # - dataset_generator: "wut" + # dataset_name: ["mk1"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["mk2"] # 2 1 + # - dataset_generator: "wut" + # dataset_name: ["mk3"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["mk4"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["olympic"] # 5 1 + # - dataset_generator: "wut" + # dataset_name: ["smile"] # 4,6 2 + # - dataset_generator: "wut" + # dataset_name: ["stripes"] # 2 1 + # - dataset_generator: "wut" + # dataset_name: ["trajectories"] # 4 1 + # - dataset_generator: "wut" + # dataset_name: ["trapped_lovers"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["twosplashes"] # 2 1 + # - dataset_generator: "wut" + # dataset_name: ["windows"] # 5 1 + # - dataset_generator: "wut" + # dataset_name: ["x1"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["x2"] # 3 1 + # - dataset_generator: "wut" + # 
dataset_name: ["x3"] # 4 1 + # - dataset_generator: "wut" + # dataset_name: ["z1"] # 3 1 + # - dataset_generator: "wut" + # dataset_name: ["z2"] # 5 1 + # - dataset_generator: "wut" + # dataset_name: ["z3"] # 4 1 outputs: - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + path: "{dataset}.data.gz" - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + path: "{dataset}.labels0.gz" ## clustbench methods (fastcluster) ################################################################### - + - id: clustering modules: - id: fastcluster @@ -119,73 +167,65 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" + commit: e644ce5 parameters: - - values: ["--linkage", "complete"] - # - values: ["--linkage", "ward"] - # - values: ["--linkage", "average"] - # - values: ["--linkage", "weighted"] - # - values: ["--linkage", "median"] - # - values: ["--linkage", "centroid"] + - linkage: "complete" + #- linkage: ["ward", "average", "weighted", "median", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 + commit: dcf35e1 parameters: - - values: ["--method", "birch"] - # - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - # - values: ["--method", "gm"] + - method: "birch" + # ["kmeans, "gm"] + # ["spectral"] ## too slow - id: agglomerative name: "agglomerative" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_agglomerative - commit: 5454368 + commit: 9d086a9 parameters: - - values: ["--linkage", "average"] - # - values: ["--linkage", "complete"] - # - values: ["--linkage", "ward"] + - linkage: "average" + # ["complete", "ward"] - id: genieclust name: "genieclust" 
software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_genieclust - commit: 6090043 + commit: 7d9e799 parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - # - values: ["--method", "gic"] - # - values: ["--method", "ica"] + - method: "genie" + # method: ["gic", "ica"] + gini_threshold: 0.5 - id: fcps name: "fcps" software_environment: "fcps" repository: url: https://github.com/imallona/clustbench_fcps - commit: fc37faa + commit: e780fed parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in Conda - - values: ["--method", "FCPS_Minimax", "--seed", 2] - # - values: ["--method", "FCPS_MinEnergy", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2] - # - values: ["--method", "FCPS_Diana", "--seed", 2] - # - values: ["--method", "FCPS_Fanny", "--seed", 2] - # - values: ["--method", "FCPS_Hardcl", "--seed", 2] - # - values: ["--method", "FCPS_Softcl", "--seed", 2] - # - values: ["--method", "FCPS_Clara", "--seed", 2] - # - values: ["--method", "FCPS_PAM", "--seed", 2] + - method: "FCPS_Minimax" + seed: 2 + # - "FCPS_AdaptiveDensityPeak" # not in Conda + # - "FCPS_MinEnergy", + # - "FCPS_HDBSCAN_2", + # - "FCPS_HDBSCAN_4", + # - "FCPS_HDBSCAN_8", + # - "FCPS_Diana", + # - "FCPS_Fanny", + # - "FCPS_Hardcl", + # - "FCPS_Softcl", + # - "FCPS_Clara", + # - "FCPS_PAM" inputs: - - entries: - - data.matrix - - data.true_labels + - data.matrix + - data.true_labels outputs: - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + path: "{dataset}_ks_range.labels.gz" - id: metrics modules: @@ -194,23 +234,21 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 + commit: c4eda85 parameters: - - values: ["--metric", "normalized_clustering_accuracy"] 
- - values: ["--metric", "adjusted_fm_score"] - # - values: ["--metric", "adjusted_mi_score"] - # - values: ["--metric", "adjusted_rand_score"] - # - values: ["--metric", "fm_score"] - # - values: ["--metric", "mi_score"] - # - values: ["--metric", "normalized_clustering_accuracy"] - # - values: ["--metric", "normalized_mi_score"] - # - values: ["--metric", "normalized_pivoted_accuracy"] - # - values: ["--metric", "pair_sets_index"] - # - values: ["--metric", "rand_score"] + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score"] + # - "adjusted_mi_score" + # - "adjusted_rand_score" + # - "fm_score" + # - "mi_score" + # - "normalized_clustering_accuracy" + # - "normalized_mi_score" + # - "normalized_pivoted_accuracy" + # - "pair_sets_index" + # - "rand_score" inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels + - clustering.predicted_ks_range + - data.true_labels outputs: - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" + path: "{dataset}.scores.gz" diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 1b0afee..4382ad7 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,11 +1,11 @@ id: clustering_example_envmodules -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 +description: "Clustering benchmark on Gagolewski's, true number of clusters plus minus 2." 
+version: "1.5.0" benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleenvmodules +storage: + api: S3 + endpoint: http://omnibenchmark.mls.uzh.ch:9000 + bucket_name: clusteringexampleenvmodules software_backend: envmodules software_environments: clustbench: @@ -24,12 +24,12 @@ metric_collectors: software_environment: "fcps" repository: url: https://github.com/imallona/clustering_report - commit: bbb9d56 + commit: "040" inputs: - metrics.scores outputs: - id: plotting.html - path: "{input}/{name}/plotting_report.html" + path: "{name}/plotting_report.html" stages: ## clustbench data ########################################################## @@ -40,78 +40,139 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: fc67ebd parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] 
# 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - # - values: ["--dataset_generator", "uci", 
"--dataset_name", "sonar"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 
5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + - dataset_generator: "fcps" + dataset_name: ["atom", "chainlink"] # 2 1 + + # - dataset_generator: "fcps" + # dataset_name: "engytime" # 2 2 + # - dataset_generator: "fcps" + # dataset_name: "hepta" # 7 1 + # - dataset_generator: "fcps" + # dataset_name: "lsun" # 3 1 + # - dataset_generator: "fcps" + # dataset_name: "target" # 2, 6 2 + # - dataset_generator: "fcps" + # dataset_name: "tetra" # 4 1 + # - dataset_generator: "fcps" + # dataset_name: "twodiamonds" # 2 1 + # - dataset_generator: "fcps" + # dataset_name: "wingnut" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "dense" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "fuzzyx" # 2, 4, 5 6 + # - dataset_generator: "graves" + # dataset_name: "line" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "parabolic" # 2, 42 + # - dataset_generator: "graves" + # dataset_name: "ring" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "ring_noisy" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "ring_outliers" # 2, 52 + # - dataset_generator: "graves" + # dataset_name: "zigzag" # 3, 5 2 + # - dataset_generator: "graves" + # dataset_name: "zigzag_noisy" # 3, 52 + # - dataset_generator: "graves" + # dataset_name: "zigzag_outliers" # 3, 52 + # - dataset_generator: "other" + # dataset_name: "chameleon_t4_8k" # 6 1 + # - dataset_generator: "other" + # dataset_name: "chameleon_t5_8k" # 6 1 + # - dataset_generator: "other" + # dataset_name: "hdbscan" # 6 1 + # - dataset_generator: "other" + # dataset_name: "iris" # 3 1 + # - dataset_generator: "other" + # dataset_name: "iris5" # 3 1 + # - dataset_generator: "other" + # dataset_name: "square" # 2 1 + # - dataset_generator: "sipu" + # dataset_name: "aggregation" # 7 1 + # - dataset_generator: "sipu" + # dataset_name: "compound" # 4, 5, 6 5 + # - dataset_generator: "sipu" + # dataset_name: "flame" # 2 2 + # - dataset_generator: "sipu" + # dataset_name: 
"jain" # 2 1 + # - dataset_generator: "sipu" + # dataset_name: "pathbased" # 3, 4 2 + # - dataset_generator: "sipu" + # dataset_name: "r15" # 8, 9, 15 3 + # - dataset_generator: "sipu" + # dataset_name: "spiral" # 3 1 + # - dataset_generator: "sipu" + # dataset_name: "unbalance" # 8 1 + # - dataset_generator: "uci" + # dataset_name: "ecoli" # 8 1 + # - dataset_generator: "uci" + # dataset_name: "ionosphere" # 2 1 + # - dataset_generator: "uci" + # dataset_name: "sonar" # 2 1 + # - dataset_generator: "uci" + # dataset_name: "statlog" # 7 1 + # - dataset_generator: "uci" + # dataset_name: "wdbc" # 2 1 + # - dataset_generator: "uci" + # dataset_name: "wine" # 3 1 + # - dataset_generator: "uci" + # dataset_name: "yeast" # 10 1 + # - dataset_generator: "wut" + # dataset_name: "circles" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "cross" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "graph" # 10 1 + # - dataset_generator: "wut" + # dataset_name: "isolation" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "labirynth" # 6 1 + # - dataset_generator: "wut" + # dataset_name: "mk1" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "mk2" # 2 1 + # - dataset_generator: "wut" + # dataset_name: "mk3" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "mk4" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "olympic" # 5 1 + # - dataset_generator: "wut" + # dataset_name: "smile" # 4, 6 2 + # - dataset_generator: "wut" + # dataset_name: "stripes" # 2 1 + # - dataset_generator: "wut" + # dataset_name: "trajectories" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "trapped_lovers" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "twosplashes" # 2 1 + # - dataset_generator: "wut" + # dataset_name: "windows" # 5 1 + # - dataset_generator: "wut" + # dataset_name: "x1" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "x2" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "x3" # 4 1 + # - dataset_generator: "wut" + # 
dataset_name: "z1" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "z2" # 5 1 + # - dataset_generator: "wut" + # dataset_name: "z3" # 4 1 outputs: - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + path: "{dataset}.data.gz" - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + path: "{dataset}.labels0.gz" ## clustbench methods (fastcluster) ################################################################### - + - id: clustering modules: - id: fastcluster @@ -119,73 +180,65 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" + commit: e644ce5 parameters: - - values: ["--linkage", "complete"] - # - values: ["--linkage", "ward"] - # - values: ["--linkage", "average"] - # - values: ["--linkage", "weighted"] - # - values: ["--linkage", "median"] - # - values: ["--linkage", "centroid"] + - linkage: "complete" + #- linkage: ["ward", "average", "weighted", "median", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 + commit: dcf35e1 parameters: - - values: ["--method", "birch"] - # - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - # - values: ["--method", "gm"] + - method: "birch" + # ["kmeans, "gm"] + # ["spectral"] ## too slow - id: agglomerative name: "agglomerative" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_agglomerative - commit: 5454368 + commit: 9d086a9 parameters: - - values: ["--linkage", "average"] - # - values: ["--linkage", "complete"] - # - values: ["--linkage", "ward"] + - linkage: "average" + # ["complete", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" repository: url: 
https://github.com/imallona/clustbench_genieclust - commit: 6090043 + commit: 7d9e799 parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - # - values: ["--method", "gic"] - # - values: ["--method", "ica"] + - method: "genie" + gini_threshold: 0.5 + # method: ["gic", "ica"] - id: fcps name: "fcps" software_environment: "fcps" repository: url: https://github.com/imallona/clustbench_fcps - commit: fc37faa + commit: e780fed parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in Conda - - values: ["--method", "FCPS_Minimax", "--seed", 2] - # - values: ["--method", "FCPS_MinEnergy", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2] - # - values: ["--method", "FCPS_Diana", "--seed", 2] - # - values: ["--method", "FCPS_Fanny", "--seed", 2] - # - values: ["--method", "FCPS_Hardcl", "--seed", 2] - # - values: ["--method", "FCPS_Softcl", "--seed", 2] - # - values: ["--method", "FCPS_Clara", "--seed", 2] - # - values: ["--method", "FCPS_PAM", "--seed", 2] + - method: "FCPS_Minimax" + seed: 2 + # - "FCPS_AdaptiveDensityPeak" # not in Conda + # - "FCPS_MinEnergy", + # - "FCPS_HDBSCAN_2", + # - "FCPS_HDBSCAN_4", + # - "FCPS_HDBSCAN_8", + # - "FCPS_Diana", + # - "FCPS_Fanny", + # - "FCPS_Hardcl", + # - "FCPS_Softcl", + # - "FCPS_Clara", + # - "FCPS_PAM" inputs: - - entries: - - data.matrix - - data.true_labels + - data.matrix + - data.true_labels outputs: - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + path: "{dataset}_ks_range.labels.gz" - id: metrics modules: @@ -194,23 +247,21 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 + commit: c4eda85 parameters: - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "adjusted_fm_score"] - # - 
values: ["--metric", "adjusted_mi_score"] - # - values: ["--metric", "adjusted_rand_score"] - # - values: ["--metric", "fm_score"] - # - values: ["--metric", "mi_score"] - # - values: ["--metric", "normalized_clustering_accuracy"] - # - values: ["--metric", "normalized_mi_score"] - # - values: ["--metric", "normalized_pivoted_accuracy"] - # - values: ["--metric", "pair_sets_index"] - # - values: ["--metric", "rand_score"] + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score"] + # - "adjusted_mi_score" + # - "adjusted_rand_score" + # - "fm_score" + # - "mi_score" + # - "normalized_clustering_accuracy" + # - "normalized_mi_score" + # - "normalized_pivoted_accuracy" + # - "pair_sets_index" + # - "rand_score" inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels + - clustering.predicted_ks_range + - data.true_labels outputs: - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" + path: "{dataset}.scores.gz" diff --git a/Clustering_oras.yml b/Clustering_oras.yml index b8afaeb..3005070 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -1,11 +1,11 @@ id: clustering_example_oras -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 +description: "Clustering benchmark on Gagolewski's, true number of clusters plus minus 2." +version: "1.5.0" benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleoras +storage: + api: S3 + endpoint: http://omnibenchmark.mls.uzh.ch:9000 + bucket_name: clusteringexampleoras software_backend: apptainer software_environments: clustbench: @@ -21,15 +21,15 @@ software_environments: metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "fcps" + software_environment: fcps repository: url: https://github.com/imallona/clustering_report - commit: bbb9d56 + commit: "040" inputs: - metrics.scores outputs: - id: plotting.html - path: "{input}/{name}/plotting_report.html" + path: "{name}/plotting_report.html" stages: ## clustbench data ########################################################## @@ -40,78 +40,139 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: fc67ebd parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - # - values: ["--dataset_generator", 
"graves", "--dataset_name", "zigzag"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 - # - values: ["--dataset_generator", "wut", 
"--dataset_name", "circles"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + - dataset_generator: "fcps" + dataset_name: ["atom", "chainlink"] # 2 1 + + # - dataset_generator: "fcps" + # dataset_name: "engytime" # 2 2 + # - dataset_generator: "fcps" + # dataset_name: "hepta" # 7 1 + # - dataset_generator: "fcps" + # dataset_name: "lsun" # 3 1 + # - dataset_generator: "fcps" + # 
dataset_name: "target" # 2, 6 2 + # - dataset_generator: "fcps" + # dataset_name: "tetra" # 4 1 + # - dataset_generator: "fcps" + # dataset_name: "twodiamonds" # 2 1 + # - dataset_generator: "fcps" + # dataset_name: "wingnut" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "dense" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "fuzzyx" # 2, 4, 5 6 + # - dataset_generator: "graves" + # dataset_name: "line" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "parabolic" # 2, 4 2 + # - dataset_generator: "graves" + # dataset_name: "ring" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "ring_noisy" # 2 1 + # - dataset_generator: "graves" + # dataset_name: "ring_outliers" # 2, 5 2 + # - dataset_generator: "graves" + # dataset_name: "zigzag" # 3, 5 2 + # - dataset_generator: "graves" + # dataset_name: "zigzag_noisy" # 3, 5 2 + # - dataset_generator: "graves" + # dataset_name: "zigzag_outliers" # 3, 5 2 + # - dataset_generator: "other" + # dataset_name: "chameleon_t4_8k" # 6 1 + # - dataset_generator: "other" + # dataset_name: "chameleon_t5_8k" # 6 1 + # - dataset_generator: "other" + # dataset_name: "hdbscan" # 6 1 + # - dataset_generator: "other" + # dataset_name: "iris" # 3 1 + # - dataset_generator: "other" + # dataset_name: "iris5" # 3 1 + # - dataset_generator: "other" + # dataset_name: "square" # 2 1 + # - dataset_generator: "sipu" + # dataset_name: "aggregation" # 7 1 + # - dataset_generator: "sipu" + # dataset_name: "compound" # 4, 5, 6 5 + # - dataset_generator: "sipu" + # dataset_name: "flame" # 2 2 + # - dataset_generator: "sipu" + # dataset_name: "jain" # 2 1 + # - dataset_generator: "sipu" + # dataset_name: "pathbased" # 3, 4 2 + # - dataset_generator: "sipu" + # dataset_name: "r15" # 8, 9, 15 3 + # - dataset_generator: "sipu" + # dataset_name: "spiral" # 3 1 + # - dataset_generator: "sipu" + # dataset_name: "unbalance" # 8 1 + # - dataset_generator: "uci" + # dataset_name: "ecoli" # 8 1 + # - dataset_generator: "uci" + #
dataset_name: "ionosphere" # 2 1 + # - dataset_generator: "uci" + # dataset_name: "sonar" # 2 1 + # - dataset_generator: "uci" + # dataset_name: "statlog" # 7 1 + # - dataset_generator: "uci" + # dataset_name: "wdbc" # 2 1 + # - dataset_generator: "uci" + # dataset_name: "wine" # 3 1 + # - dataset_generator: "uci" + # dataset_name: "yeast" # 10 1 + # - dataset_generator: "wut" + # dataset_name: "circles" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "cross" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "graph" # 10 1 + # - dataset_generator: "wut" + # dataset_name: "isolation" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "labirynth" # 6 1 + # - dataset_generator: "wut" + # dataset_name: "mk1" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "mk2" # 2 1 + # - dataset_generator: "wut" + # dataset_name: "mk3" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "mk4" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "olympic" # 5 1 + # - dataset_generator: "wut" + # dataset_name: "smile" # 4, 6 2 + # - dataset_generator: "wut" + # dataset_name: "stripes" # 2 1 + # - dataset_generator: "wut" + # dataset_name: "trajectories" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "trapped_lovers" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "twosplashes" # 2 1 + # - dataset_generator: "wut" + # dataset_name: "windows" # 5 1 + # - dataset_generator: "wut" + # dataset_name: "x1" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "x2" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "x3" # 4 1 + # - dataset_generator: "wut" + # dataset_name: "z1" # 3 1 + # - dataset_generator: "wut" + # dataset_name: "z2" # 5 1 + # - dataset_generator: "wut" + # dataset_name: "z3" # 4 1 outputs: - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + path: "{dataset}.data.gz" - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + path: "{dataset}.labels0.gz" ## 
clustbench methods (fastcluster) ################################################################### - + - id: clustering modules: - id: fastcluster @@ -119,73 +180,65 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" + commit: e644ce5 parameters: - - values: ["--linkage", "complete"] - # - values: ["--linkage", "ward"] - # - values: ["--linkage", "average"] - # - values: ["--linkage", "weighted"] - # - values: ["--linkage", "median"] - # - values: ["--linkage", "centroid"] + - linkage: "complete" + #- linkage: ["ward", "average", "weighted", "median", "centroid"] - id: sklearn name: "sklearn" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 + commit: dcf35e1 parameters: - - values: ["--method", "birch"] - # - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - # - values: ["--method", "gm"] + - method: "birch" + # ["kmeans", "gm"] + # ["spectral"] ## too slow - id: agglomerative name: "agglomerative" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_agglomerative - commit: 5454368 + commit: 9d086a9 parameters: - - values: ["--linkage", "average"] - # - values: ["--linkage", "complete"] - # - values: ["--linkage", "ward"] + - linkage: "average" + # ["complete", "ward"] - id: genieclust name: "genieclust" software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_genieclust - commit: 6090043 + commit: 7d9e799 parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - # - values: ["--method", "gic"] - # - values: ["--method", "ica"] + - method: "genie" + # method: ["gic", "ica"] + gini_threshold: 0.5 - id: fcps name: "fcps" software_environment: "fcps" repository: url:
https://github.com/imallona/clustbench_fcps - commit: fc37faa + commit: e780fed parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in Conda - - values: ["--method", "FCPS_Minimax", "--seed", 2] - # - values: ["--method", "FCPS_MinEnergy", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_2", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_4", "--seed", 2] - # - values: ["--method", "FCPS_HDBSCAN_8", "--seed", 2] - # - values: ["--method", "FCPS_Diana", "--seed", 2] - # - values: ["--method", "FCPS_Fanny", "--seed", 2] - # - values: ["--method", "FCPS_Hardcl", "--seed", 2] - # - values: ["--method", "FCPS_Softcl", "--seed", 2] - # - values: ["--method", "FCPS_Clara", "--seed", 2] - # - values: ["--method", "FCPS_PAM", "--seed", 2] + - method: "FCPS_Minimax" + seed: 2 + # - "FCPS_AdaptiveDensityPeak" # not in Conda + # - "FCPS_MinEnergy", + # - "FCPS_HDBSCAN_2", + # - "FCPS_HDBSCAN_4", + # - "FCPS_HDBSCAN_8", + # - "FCPS_Diana", + # - "FCPS_Fanny", + # - "FCPS_Hardcl", + # - "FCPS_Softcl", + # - "FCPS_Clara", + # - "FCPS_PAM" inputs: - - entries: - - data.matrix - - data.true_labels + - data.matrix + - data.true_labels outputs: - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + path: "{dataset}_ks_range.labels.gz" - id: metrics modules: @@ -194,23 +247,21 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 + commit: c4eda85 parameters: - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "adjusted_fm_score"] - # - values: ["--metric", "adjusted_mi_score"] - # - values: ["--metric", "adjusted_rand_score"] - # - values: ["--metric", "fm_score"] - # - values: ["--metric", "mi_score"] - # - values: ["--metric", "normalized_clustering_accuracy"] - # - values: ["--metric", "normalized_mi_score"] - # - values: ["--metric", "normalized_pivoted_accuracy"] - # - values: 
["--metric", "pair_sets_index"] - # - values: ["--metric", "rand_score"] + - metric: ["normalized_clustering_accuracy", "adjusted_fm_score"] + # - "adjusted_mi_score" + # - "adjusted_rand_score" + # - "fm_score" + # - "mi_score" + # - "normalized_clustering_accuracy" + # - "normalized_mi_score" + # - "normalized_pivoted_accuracy" + # - "pair_sets_index" + # - "rand_score" inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels + - clustering.predicted_ks_range + - data.true_labels outputs: - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" + path: "{dataset}.scores.gz" diff --git a/envs/clustbench.yml b/envs/clustbench.yml index 6cb6201..fe3e3a0 100644 --- a/envs/clustbench.yml +++ b/envs/clustbench.yml @@ -8,7 +8,6 @@ dependencies: - pip: #- "clustering-benchmarks==1.1.5" - 'https://github.com/gagolews/clustering-benchmarks/releases/download/v1.1.5/clustering_benchmarks-1.1.5.tar.gz' - - "wget" - "fastcluster==1.2.6" - "numpy==1.26.4" - "scipy==1.14.1"