From e7be305f709f247c64358b92d87c2944d5b9fb7a Mon Sep 17 00:00:00 2001
From: Vanshitaaa20 <vanshitapandey31017@gmail.com>
Date: Wed, 16 Jul 2025 03:11:31 +0530
Subject: [PATCH 1/3] added benchmark for pc and ges

---
 benchmarks/causal_discovery.py | 76 ++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 benchmarks/causal_discovery.py

diff --git a/benchmarks/causal_discovery.py b/benchmarks/causal_discovery.py
new file mode 100644
index 0000000..50c052b
--- /dev/null
+++ b/benchmarks/causal_discovery.py
@@ -0,0 +1,76 @@
+import numpy as np
+
+from pgmpy.base import DAG
+from pgmpy.estimators import PC, GES
+from pgmpy.metrics import SHD
+from pgmpy.factors.continuous import LinearGaussianCPD
+from pgmpy.models import LinearGaussianBayesianNetwork as LGBN
+
+
+def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -> DAG:
+    dag = DAG.get_random(n_nodes=num_nodes, edge_prob=edge_prob, seed=seed)
+    for i in range(num_nodes):
+        dag.add_node(f"X_{i}")
+    return dag
+
+def compute_shd_direct(true_dag, learned_dag) -> int:
+    E_true = set(true_dag.edges())
+    E_est  = set(learned_dag.edges())
+    return len(E_true.symmetric_difference(E_est))
+
+num_trials = 10
+shd_pc_list = []
+shd_ges_list = []
+
+for trial in range(num_trials):
+    np.random.seed(trial)
+    print(f"\nTrial {trial + 1}/{num_trials}")
+
+    true_dag = generate_random_dag(num_nodes=5, edge_prob=0.3, seed=trial)
+    
+    lgbn = LGBN(true_dag.edges())
+    lgbn.add_nodes_from(true_dag.nodes())
+    for node in true_dag.nodes():
+        parents = list(lgbn.get_parents(node))
+        beta = [0.0] + list(np.random.uniform(0.5, 1.5, size=len(parents)))
+        cpd = LinearGaussianCPD(variable=node, beta=beta, std=1, evidence=parents)
+        lgbn.add_cpds(cpd)
+
+    data = lgbn.simulate(n=1000)
+    
+    try:
+        learned_dag_pc = PC(data).estimate(
+            ci_test="pearsonr",
+            variant="stable",
+            return_type="dag",
+        )
+    except Exception as e:
+        print(" PC estimation failed:", e)
+        continue
+    
+    try:
+        ges_out = GES(data).estimate(scoring_method="bic-g")
+        learned_dag_ges = (
+            ges_out["model"]
+            if isinstance(ges_out, dict) and "model" in ges_out
+            else (ges_out[0] if isinstance(ges_out, tuple) else ges_out)
+        )
+    except Exception as e:
+        print(" GES estimation failed:", e)
+        continue
+
+    for g in (learned_dag_pc, learned_dag_ges):
+        g.add_nodes_from(true_dag.nodes())
+
+    shd_pc  = compute_shd_direct(true_dag, learned_dag_pc)
+    shd_ges = compute_shd_direct(true_dag, learned_dag_ges)
+
+    shd_pc_list.append(shd_pc)
+    shd_ges_list.append(shd_ges)
+
+    print(" SHD (PC):", shd_pc)
+    print(" SHD (GES):", shd_ges)
+
+print(f"\nAverage SHD over {len(shd_pc_list)} successful trials:")
+print(f"  PC:  {np.mean(shd_pc_list):.2f} ± {np.std(shd_pc_list):.2f}")
+print(f"  GES: {np.mean(shd_ges_list):.2f} ± {np.std(shd_ges_list):.2f}")

From 96322331a9f163a5d8a54e0db2f168948b48476f Mon Sep 17 00:00:00 2001
From: Vanshitaaa20 <vanshitapandey31017@gmail.com>
Date: Sat, 23 Aug 2025 19:15:28 +0530
Subject: [PATCH 2/3] Print both edge list and node list in SHD benchmarking

---
 benchmarks/causal_discovery.py | 61 ++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/benchmarks/causal_discovery.py b/benchmarks/causal_discovery.py
index 50c052b..ef19f24 100644
--- a/benchmarks/causal_discovery.py
+++ b/benchmarks/causal_discovery.py
@@ -1,11 +1,34 @@
 import numpy as np
-
 from pgmpy.base import DAG
 from pgmpy.estimators import PC, GES
 from pgmpy.metrics import SHD
 from pgmpy.factors.continuous import LinearGaussianCPD
 from pgmpy.models import LinearGaussianBayesianNetwork as LGBN
 
+"""
+Benchmarking Structural Hamming Distance (SHD) for Causal Discovery Algorithms: PC and GES
+
+Algorithm Definitions:
+----------------------
+- PC (Peter-Clark) Algorithm:
+  A constraint-based algorithm that starts with a complete undirected graph and removes edges
+  based on conditional independence tests. It then orients edges using separation sets and
+  rules like the collider rule:
+      X → Z ← Y   if X ⟂⟂ Y | Z
+
+- GES (Greedy Equivalence Search) Algorithm:
+  A score-based algorithm that performs greedy forward and backward search in the space
+  of equivalence classes of DAGs to maximize a scoring criterion such as BIC.
+
+  Scoring function (Bayesian Information Criterion - BIC):
+      Score(G : D) = log P(D | G) - λ * |G|
+
+Metric:
+-------
+- SHD (Structural Hamming Distance):
+  Measures the number of edge insertions, deletions, or reversals required to convert
+  one DAG into another.
+"""
 
 def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -> DAG:
     dag = DAG.get_random(n_nodes=num_nodes, edge_prob=edge_prob, seed=seed)
@@ -13,21 +36,18 @@ def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -
         dag.add_node(f"X_{i}")
     return dag
 
-def compute_shd_direct(true_dag, learned_dag) -> int:
-    E_true = set(true_dag.edges())
-    E_est  = set(learned_dag.edges())
-    return len(E_true.symmetric_difference(E_est))
-
+# Benchmark parameters
 num_trials = 10
 shd_pc_list = []
 shd_ges_list = []
 
+# Run trials
 for trial in range(num_trials):
     np.random.seed(trial)
     print(f"\nTrial {trial + 1}/{num_trials}")
 
     true_dag = generate_random_dag(num_nodes=5, edge_prob=0.3, seed=trial)
-    
+
     lgbn = LGBN(true_dag.edges())
     lgbn.add_nodes_from(true_dag.nodes())
     for node in true_dag.nodes():
@@ -37,7 +57,8 @@ def compute_shd_direct(true_dag, learned_dag) -> int:
         lgbn.add_cpds(cpd)
 
     data = lgbn.simulate(n=1000)
-    
+
+    # PC Estimation
     try:
         learned_dag_pc = PC(data).estimate(
             ci_test="pearsonr",
@@ -47,7 +68,8 @@ def compute_shd_direct(true_dag, learned_dag) -> int:
     except Exception as e:
         print(" PC estimation failed:", e)
         continue
-    
+
+    # GES Estimation
     try:
         ges_out = GES(data).estimate(scoring_method="bic-g")
         learned_dag_ges = (
@@ -59,11 +81,23 @@ def compute_shd_direct(true_dag, learned_dag) -> int:
         print(" GES estimation failed:", e)
         continue
 
-    for g in (learned_dag_pc, learned_dag_ges):
-        g.add_nodes_from(true_dag.nodes())
+    # Ensure node alignment
+    all_nodes = sorted(set(true_dag.nodes()).union(
+        set(learned_dag_pc.nodes())).union(set(learned_dag_ges.nodes())))
+    true_dag.add_nodes_from(all_nodes)
+    learned_dag_pc.add_nodes_from(all_nodes)
+    learned_dag_ges.add_nodes_from(all_nodes)
 
-    shd_pc  = compute_shd_direct(true_dag, learned_dag_pc)
-    shd_ges = compute_shd_direct(true_dag, learned_dag_ges)
+    # Compute SHD using built-in method
+    try:
+        shd_pc = SHD(true_dag, learned_dag_pc)
+        shd_ges = SHD(true_dag, learned_dag_ges)
+    except Exception as e:
+        print(" SHD computation failed:", e)
+        print(" true_dag edges:", true_dag.edges())
+        print(" learned_dag_pc edges:", learned_dag_pc.edges())
+        print(" learned_dag_ges edges:", learned_dag_ges.edges())
+        continue
 
     shd_pc_list.append(shd_pc)
     shd_ges_list.append(shd_ges)
@@ -71,6 +105,7 @@ def compute_shd_direct(true_dag, learned_dag) -> int:
     print(" SHD (PC):", shd_pc)
     print(" SHD (GES):", shd_ges)
 
+# Final Results
 print(f"\nAverage SHD over {len(shd_pc_list)} successful trials:")
 print(f"  PC:  {np.mean(shd_pc_list):.2f} ± {np.std(shd_pc_list):.2f}")
 print(f"  GES: {np.mean(shd_ges_list):.2f} ± {np.std(shd_ges_list):.2f}")

From e49851abb4acba4fff741f4fc0168150b0be35a9 Mon Sep 17 00:00:00 2001
From: Vanshitaaa20 <vanshitapandey31017@gmail.com>
Date: Sat, 13 Sep 2025 11:20:13 +0530
Subject: [PATCH 3/3] Remove redundant node alignment; keep raw DAGs for
 debugging

---
 benchmarks/causal_discovery.py | 91 +++++++++++-----------------------
 1 file changed, 28 insertions(+), 63 deletions(-)

diff --git a/benchmarks/causal_discovery.py b/benchmarks/causal_discovery.py
index ef19f24..ead3903 100644
--- a/benchmarks/causal_discovery.py
+++ b/benchmarks/causal_discovery.py
@@ -1,8 +1,9 @@
 import numpy as np
+import pandas as pd
+import networkx as nx
 from pgmpy.base import DAG
 from pgmpy.estimators import PC, GES
 from pgmpy.metrics import SHD
-from pgmpy.factors.continuous import LinearGaussianCPD
 from pgmpy.models import LinearGaussianBayesianNetwork as LGBN
 
 """
@@ -30,82 +31,46 @@
   one DAG into another.
 """
 
-def generate_random_dag(num_nodes: int, edge_prob: float = 0.3, seed: int = 0) -> DAG:
-    dag = DAG.get_random(n_nodes=num_nodes, edge_prob=edge_prob, seed=seed)
-    for i in range(num_nodes):
-        dag.add_node(f"X_{i}")
-    return dag
-
 # Benchmark parameters
 num_trials = 10
-shd_pc_list = []
-shd_ges_list = []
+results = []
 
-# Run trials
 for trial in range(num_trials):
     np.random.seed(trial)
     print(f"\nTrial {trial + 1}/{num_trials}")
 
-    true_dag = generate_random_dag(num_nodes=5, edge_prob=0.3, seed=trial)
-
-    lgbn = LGBN(true_dag.edges())
-    lgbn.add_nodes_from(true_dag.nodes())
-    for node in true_dag.nodes():
-        parents = list(lgbn.get_parents(node))
-        beta = [0.0] + list(np.random.uniform(0.5, 1.5, size=len(parents)))
-        cpd = LinearGaussianCPD(variable=node, beta=beta, std=1, evidence=parents)
-        lgbn.add_cpds(cpd)
+    # Generate random true model
+    lgbn = LGBN.get_random(n_nodes=5, edge_prob=0.3, seed=trial)
+    true_dag = DAG(lgbn.edges())
+    true_dag.add_nodes_from(lgbn.nodes()) 
 
+    # Simulate data
     data = lgbn.simulate(n=1000)
 
-    # PC Estimation
-    try:
-        learned_dag_pc = PC(data).estimate(
-            ci_test="pearsonr",
-            variant="stable",
-            return_type="dag",
-        )
-    except Exception as e:
-        print(" PC estimation failed:", e)
-        continue
-
-    # GES Estimation
-    try:
-        ges_out = GES(data).estimate(scoring_method="bic-g")
-        learned_dag_ges = (
-            ges_out["model"]
-            if isinstance(ges_out, dict) and "model" in ges_out
-            else (ges_out[0] if isinstance(ges_out, tuple) else ges_out)
-        )
-    except Exception as e:
-        print(" GES estimation failed:", e)
-        continue
+    # PC estimation
+    pc_est = PC(data).estimate(ci_test="pearsonr", variant="stable", return_type="dag")
+    learned_dag_pc = nx.DiGraph(pc_est.edges())
 
-    # Ensure node alignment
-    all_nodes = sorted(set(true_dag.nodes()).union(
-        set(learned_dag_pc.nodes())).union(set(learned_dag_ges.nodes())))
-    true_dag.add_nodes_from(all_nodes)
-    learned_dag_pc.add_nodes_from(all_nodes)
-    learned_dag_ges.add_nodes_from(all_nodes)
+    # GES estimation
+    ges_out = GES(data).estimate(scoring_method="bic-g")
+    ges_model = ges_out["model"] if isinstance(ges_out, dict) else ges_out
+    learned_dag_ges = nx.DiGraph(ges_model.edges())
 
-    # Compute SHD using built-in method
-    try:
-        shd_pc = SHD(true_dag, learned_dag_pc)
-        shd_ges = SHD(true_dag, learned_dag_ges)
-    except Exception as e:
-        print(" SHD computation failed:", e)
-        print(" true_dag edges:", true_dag.edges())
-        print(" learned_dag_pc edges:", learned_dag_pc.edges())
-        print(" learned_dag_ges edges:", learned_dag_ges.edges())
-        continue
+    # Debug: show node sets before SHD
+    print(" True nodes:   ", set(true_dag.nodes()))
+    print(" PC nodes:     ", set(learned_dag_pc.nodes()))
+    print(" GES nodes:    ", set(learned_dag_ges.nodes()))
 
-    shd_pc_list.append(shd_pc)
-    shd_ges_list.append(shd_ges)
+    # Compute SHD (this will crash if node sets differ)
+    shd_pc = SHD(true_dag, learned_dag_pc)
+    shd_ges = SHD(true_dag, learned_dag_ges)
 
+    results.append({"trial": trial + 1, "shd_pc": shd_pc, "shd_ges": shd_ges})
     print(" SHD (PC):", shd_pc)
     print(" SHD (GES):", shd_ges)
 
-# Final Results
-print(f"\nAverage SHD over {len(shd_pc_list)} successful trials:")
-print(f"  PC:  {np.mean(shd_pc_list):.2f} ± {np.std(shd_pc_list):.2f}")
-print(f"  GES: {np.mean(shd_ges_list):.2f} ± {np.std(shd_ges_list):.2f}")
+# Summary
+df = pd.DataFrame(results)
+print("\nAverage SHD over trials:")
+print(f"  PC:  {df['shd_pc'].mean():.2f} ± {df['shd_pc'].std():.2f}")
+print(f"  GES: {df['shd_ges'].mean():.2f} ± {df['shd_ges'].std():.2f}")
\ No newline at end of file