From e7be305f709f247c64358b92d87c2944d5b9fb7a Mon Sep 17 00:00:00 2001 From: Vanshitaaa20 Date: Wed, 16 Jul 2025 03:11:31 +0530 Subject: [PATCH 1/3] added benchmark for pc and ges --- benchmarks/causal_discovery.py | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 benchmarks/causal_discovery.py diff --git a/benchmarks/causal_discovery.py b/benchmarks/causal_discovery.py new file mode 100644 index 0000000..50c052b --- /dev/null +++ b/benchmarks/causal_discovery.py @@ -0,0 +1,76 @@ +import numpy as np + +from pgmpy.base import DAG +from pgmpy.estimators import PC, GES +from pgmpy.metrics import SHD +from pgmpy.factors.continuous import LinearGaussianCPD +from pgmpy.models import LinearGaussianBayesianNetwork as LGBN + + +def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -> DAG: + dag = DAG.get_random(n_nodes=num_nodes, edge_prob=edge_prob, seed=seed) + for i in range(num_nodes): + dag.add_node(f"X_{i}") + return dag + +def compute_shd_direct(true_dag, learned_dag) -> int: + E_true = set(true_dag.edges()) + E_est = set(learned_dag.edges()) + return len(E_true.symmetric_difference(E_est)) + +num_trials = 10 +shd_pc_list = [] +shd_ges_list = [] + +for trial in range(num_trials): + np.random.seed(trial) + print(f"\nTrial {trial + 1}/{num_trials}") + + true_dag = generate_random_dag(num_nodes=5, edge_prob=0.3, seed=trial) + + lgbn = LGBN(true_dag.edges()) + lgbn.add_nodes_from(true_dag.nodes()) + for node in true_dag.nodes(): + parents = list(lgbn.get_parents(node)) + beta = [0.0] + list(np.random.uniform(0.5, 1.5, size=len(parents))) + cpd = LinearGaussianCPD(variable=node, beta=beta, std=1, evidence=parents) + lgbn.add_cpds(cpd) + + data = lgbn.simulate(n=1000) + + try: + learned_dag_pc = PC(data).estimate( + ci_test="pearsonr", + variant="stable", + return_type="dag", + ) + except Exception as e: + print(" PC estimation failed:", e) + continue + + try: + ges_out = GES(data).estimate(scoring_method="bic-g") + learned_dag_ges = ( + ges_out["model"] + if isinstance(ges_out, dict) and "model" in ges_out + else (ges_out[0] if isinstance(ges_out, tuple) else ges_out) + ) + except Exception as e: + print(" GES estimation failed:", e) + continue + + for g in (learned_dag_pc, learned_dag_ges): + g.add_nodes_from(true_dag.nodes()) + + shd_pc = compute_shd_direct(true_dag, learned_dag_pc) + shd_ges = compute_shd_direct(true_dag, learned_dag_ges) + + shd_pc_list.append(shd_pc) + shd_ges_list.append(shd_ges) + + print(" SHD (PC):", shd_pc) + print(" SHD (GES):", shd_ges) + +print(f"\nAverage SHD over {len(shd_pc_list)} successful trials:") +print(f" PC: {np.mean(shd_pc_list):.2f} ± {np.std(shd_pc_list):.2f}") +print(f" GES: {np.mean(shd_ges_list):.2f} ± {np.std(shd_ges_list):.2f}") From 96322331a9f163a5d8a54e0db2f168948b48476f Mon Sep 17 00:00:00 2001 From: Vanshitaaa20 Date: Sat, 23 Aug 2025 19:15:28 +0530 Subject: [PATCH 2/3] Print both edge list and node list in SHD benchmarking --- benchmarks/causal_discovery.py | 61 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/benchmarks/causal_discovery.py b/benchmarks/causal_discovery.py index 50c052b..ef19f24 100644 --- a/benchmarks/causal_discovery.py +++ b/benchmarks/causal_discovery.py @@ -1,11 +1,34 @@ import numpy as np - from pgmpy.base import DAG from pgmpy.estimators import PC, GES from pgmpy.metrics import SHD from pgmpy.factors.continuous import LinearGaussianCPD from pgmpy.models import LinearGaussianBayesianNetwork as LGBN +""" +Benchmarking Structural Hamming Distance (SHD) for Causal Discovery Algorithms: PC and GES + +Algorithm Definitions: +---------------------- +- PC (Peter-Clark) Algorithm: + A constraint-based algorithm that starts with a complete undirected graph and removes edges + based on conditional independence tests. It then orients edges using separation sets and + rules like the collider rule: + X → Z ← Y if X ⟂⟂ Y | Z + +- GES (Greedy Equivalence Search) Algorithm: + A score-based algorithm that performs greedy forward and backward search in the space + of equivalence classes of DAGs to maximize a scoring criterion such as BIC. + + Scoring function (Bayesian Information Criterion - BIC): + Score(G : D) = log P(D | G) - λ * |G| + +Metric: +------- +- SHD (Structural Hamming Distance): + Measures the number of edge insertions, deletions, or reversals required to convert + one DAG into another. +""" def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -> DAG: dag = DAG.get_random(n_nodes=num_nodes, edge_prob=edge_prob, seed=seed) @@ -13,21 +36,18 @@ def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) - dag.add_node(f"X_{i}") return dag -def compute_shd_direct(true_dag, learned_dag) -> int: - E_true = set(true_dag.edges()) - E_est = set(learned_dag.edges()) - return len(E_true.symmetric_difference(E_est)) - +# Benchmark parameters num_trials = 10 shd_pc_list = [] shd_ges_list = [] +# Run trials for trial in range(num_trials): np.random.seed(trial) print(f"\nTrial {trial + 1}/{num_trials}") true_dag = generate_random_dag(num_nodes=5, edge_prob=0.3, seed=trial) - + lgbn = LGBN(true_dag.edges()) lgbn.add_nodes_from(true_dag.nodes()) for node in true_dag.nodes(): @@ -37,7 +57,8 @@ def compute_shd_direct(true_dag, learned_dag) -> int: lgbn.add_cpds(cpd) data = lgbn.simulate(n=1000) - + + # PC Estimation try: learned_dag_pc = PC(data).estimate( ci_test="pearsonr", @@ -47,7 +68,8 @@ def compute_shd_direct(true_dag, learned_dag) -> int: except Exception as e: print(" PC estimation failed:", e) continue - + + # GES Estimation try: ges_out = GES(data).estimate(scoring_method="bic-g") learned_dag_ges = ( @@ -59,11 +81,23 @@ def compute_shd_direct(true_dag, learned_dag) -> int: print(" GES estimation failed:", e) continue - for g in (learned_dag_pc, learned_dag_ges): - g.add_nodes_from(true_dag.nodes()) + # Ensure node alignment + all_nodes = sorted(set(true_dag.nodes()).union( + set(learned_dag_pc.nodes())).union(set(learned_dag_ges.nodes()))) + true_dag.add_nodes_from(all_nodes) + learned_dag_pc.add_nodes_from(all_nodes) + learned_dag_ges.add_nodes_from(all_nodes) - shd_pc = compute_shd_direct(true_dag, learned_dag_pc) - shd_ges = compute_shd_direct(true_dag, learned_dag_ges) + # Compute SHD using built-in method + try: + shd_pc = SHD(true_dag, learned_dag_pc) + shd_ges = SHD(true_dag, learned_dag_ges) + except Exception as e: + print(" SHD computation failed:", e) + print(" true_dag edges:", true_dag.edges()) + print(" learned_dag_pc edges:", learned_dag_pc.edges()) + print(" learned_dag_ges edges:", learned_dag_ges.edges()) + continue shd_pc_list.append(shd_pc) shd_ges_list.append(shd_ges) @@ -71,6 +105,7 @@ def compute_shd_direct(true_dag, learned_dag) -> int: print(" SHD (PC):", shd_pc) print(" SHD (GES):", shd_ges) +# Final Results print(f"\nAverage SHD over {len(shd_pc_list)} successful trials:") print(f" PC: {np.mean(shd_pc_list):.2f} ± {np.std(shd_pc_list):.2f}") print(f" GES: {np.mean(shd_ges_list):.2f} ± {np.std(shd_ges_list):.2f}") From e49851abb4acba4fff741f4fc0168150b0be35a9 Mon Sep 17 00:00:00 2001 From: Vanshitaaa20 Date: Sat, 13 Sep 2025 11:20:13 +0530 Subject: [PATCH 3/3] Remove redundant node alignment; keep raw DAGs for debugging --- benchmarks/causal_discovery.py | 91 +++++++++++----------------------- 1 file changed, 28 insertions(+), 63 deletions(-) diff --git a/benchmarks/causal_discovery.py b/benchmarks/causal_discovery.py index ef19f24..ead3903 100644 --- a/benchmarks/causal_discovery.py +++ b/benchmarks/causal_discovery.py @@ -1,8 +1,9 @@ import numpy as np +import pandas as pd +import networkx as nx from pgmpy.base import DAG from pgmpy.estimators import PC, GES from pgmpy.metrics import SHD -from pgmpy.factors.continuous import LinearGaussianCPD from pgmpy.models import LinearGaussianBayesianNetwork as LGBN """ @@ -30,82 +31,46 @@ one DAG into another. """ -def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -> DAG: - dag = DAG.get_random(n_nodes=num_nodes, edge_prob=edge_prob, seed=seed) - for i in range(num_nodes): - dag.add_node(f"X_{i}") - return dag - # Benchmark parameters num_trials = 10 -shd_pc_list = [] -shd_ges_list = [] +results = [] -# Run trials for trial in range(num_trials): np.random.seed(trial) print(f"\nTrial {trial + 1}/{num_trials}") - true_dag = generate_random_dag(num_nodes=5, edge_prob=0.3, seed=trial) - - lgbn = LGBN(true_dag.edges()) - lgbn.add_nodes_from(true_dag.nodes()) - for node in true_dag.nodes(): - parents = list(lgbn.get_parents(node)) - beta = [0.0] + list(np.random.uniform(0.5, 1.5, size=len(parents))) - cpd = LinearGaussianCPD(variable=node, beta=beta, std=1, evidence=parents) - lgbn.add_cpds(cpd) + # Generate random true model + lgbn = LGBN.get_random(n_nodes=5, edge_prob=0.3, seed=trial) + true_dag = DAG(lgbn.edges()) + true_dag.add_nodes_from(lgbn.nodes()) + # Simulate data data = lgbn.simulate(n=1000) - # PC Estimation - try: - learned_dag_pc = PC(data).estimate( - ci_test="pearsonr", - variant="stable", - return_type="dag", - ) - except Exception as e: - print(" PC estimation failed:", e) - continue - - # GES Estimation - try: - ges_out = GES(data).estimate(scoring_method="bic-g") - learned_dag_ges = ( - ges_out["model"] - if isinstance(ges_out, dict) and "model" in ges_out - else (ges_out[0] if isinstance(ges_out, tuple) else ges_out) - ) - except Exception as e: - print(" GES estimation failed:", e) - continue + # PC estimation + pc_est = PC(data).estimate(ci_test="pearsonr", variant="stable", return_type="dag") + learned_dag_pc = nx.DiGraph(pc_est.edges()) - # Ensure node alignment - all_nodes = sorted(set(true_dag.nodes()).union( - set(learned_dag_pc.nodes())).union(set(learned_dag_ges.nodes()))) - true_dag.add_nodes_from(all_nodes) - learned_dag_pc.add_nodes_from(all_nodes) - learned_dag_ges.add_nodes_from(all_nodes) + # GES estimation + ges_out = GES(data).estimate(scoring_method="bic-g") + ges_model = ges_out["model"] if isinstance(ges_out, dict) else ges_out + learned_dag_ges = nx.DiGraph(ges_model.edges()) - # Compute SHD using built-in method - try: - shd_pc = SHD(true_dag, learned_dag_pc) - shd_ges = SHD(true_dag, learned_dag_ges) - except Exception as e: - print(" SHD computation failed:", e) - print(" true_dag edges:", true_dag.edges()) - print(" learned_dag_pc edges:", learned_dag_pc.edges()) - print(" learned_dag_ges edges:", learned_dag_ges.edges()) - continue + # Debug: show node sets before SHD + print(" True nodes: ", set(true_dag.nodes())) + print(" PC nodes: ", set(learned_dag_pc.nodes())) + print(" GES nodes: ", set(learned_dag_ges.nodes())) - shd_pc_list.append(shd_pc) - shd_ges_list.append(shd_ges) + # Compute SHD (this will crash if node sets differ) + shd_pc = SHD(true_dag, learned_dag_pc) + shd_ges = SHD(true_dag, learned_dag_ges) + results.append({"trial": trial + 1, "shd_pc": shd_pc, "shd_ges": shd_ges}) print(" SHD (PC):", shd_pc) print(" SHD (GES):", shd_ges) -# Final Results -print(f"\nAverage SHD over {len(shd_pc_list)} successful trials:") -print(f" PC: {np.mean(shd_pc_list):.2f} ± {np.std(shd_pc_list):.2f}") -print(f" GES: {np.mean(shd_ges_list):.2f} ± {np.std(shd_ges_list):.2f}") +# Summary +df = pd.DataFrame(results) +print("\nAverage SHD over trials:") +print(f" PC: {df['shd_pc'].mean():.2f} ± {df['shd_pc'].std():.2f}") +print(f" GES: {df['shd_ges'].mean():.2f} ± {df['shd_ges'].std():.2f}") \ No newline at end of file